In [105]:
# Import required libraries
install.packages("GGally")
install.packages("matrixStats")
install.packages("readxl")
install.packages("lubridate")
install.packages("writexl")
library(readxl)
library(forecast)
library(data.table)
library(lubridate)
library(dplyr)
library(tidyr)
library(GGally)
library(dplyr)
library(writexl)
library(matrixStats)
The downloaded binary packages are in
	/var/folders/tm/h12dzwn519q9gwvz38d0m4_80000gn/T//RtmpqbESn6/downloaded_packages

The downloaded binary packages are in
	/var/folders/tm/h12dzwn519q9gwvz38d0m4_80000gn/T//RtmpqbESn6/downloaded_packages

The downloaded binary packages are in
	/var/folders/tm/h12dzwn519q9gwvz38d0m4_80000gn/T//RtmpqbESn6/downloaded_packages

The downloaded binary packages are in
	/var/folders/tm/h12dzwn519q9gwvz38d0m4_80000gn/T//RtmpqbESn6/downloaded_packages

The downloaded binary packages are in
	/var/folders/tm/h12dzwn519q9gwvz38d0m4_80000gn/T//RtmpqbESn6/downloaded_packages
In [95]:
tday=today("Turkey")
day_before_tday <- tday - 1
#day_before_tday <- tday - 2
prediction_day <- tday +1
start_date <- as.Date("2024-02-01")
end_date <- as.Date("2024-05-15")
In [3]:
file_weather = paste0("/Users/ecemozturk/Desktop/processed_weather.csv")
file_production = paste0("/Users/ecemozturk/Desktop/production-4.csv")
weather_data = fread(file_weather)
production_data = fread(file_production)
In [4]:
# getting full weather date and hours as a template
template_dt = unique(weather_data[,list(date,hour)])
template_dt = merge(template_dt,production_data,by=c('date','hour'),all.x=T)
#template_dt = template_dt[date<=(tday + 1)]
template_dt = template_dt[date<=(tday)]
In [5]:
###NA VALUES###
any_na <- anyNA(weather_data)
if (any_na) {
  cat("The dataset contains NA values.\n")
  # Display the count of NAs per column
  print(colSums(is.na(weather_data)))
} else {
  cat("The dataset does not contain any NA values.\n")
}
# Display all rows that have NA values
na_rows <- weather_data[!complete.cases(weather_data), ]
View(na_rows)

# Fill NA values with the average of the surrounding values (linear interpolation)
merged_data_filled <- weather_data %>%
  mutate(across(where(is.numeric), ~ na.approx(.x, na.rm = FALSE)))

# Fill leading NAs with the next available value, upward
merged_data_filled <- merged_data_filled %>%
  mutate(across(where(is.numeric), ~ na.locf(.x, fromLast = TRUE, na.rm = FALSE)))

weather_data<- merged_data_filled
The dataset contains NA values.
                   date                    hour                     lat 
                      0                       0                       0 
                    lon           dswrf_surface    tcdc_low.cloud.layer 
                      0                      25                      50 
tcdc_middle.cloud.layer   tcdc_high.cloud.layer  tcdc_entire.atmosphere 
                     50                     100                      63 
uswrf_top_of_atmosphere           csnow_surface           dlwrf_surface 
                     75                      25                      25 
          uswrf_surface             tmp_surface 
                     50                      25 
A data.table: 213 × 14
datehourlatlondswrf_surfacetcdc_low.cloud.layertcdc_middle.cloud.layertcdc_high.cloud.layertcdc_entire.atmosphereuswrf_top_of_atmospherecsnow_surfacedlwrf_surfaceuswrf_surfacetmp_surface
<IDate><int><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><int><dbl><dbl><dbl>
2022-01-132337.7535.000 56.090.1 NA 98.100223.9000262.257
2022-01-132338.7535.250100.010.0 NA100.000250.2000271.157
2022-01-132337.7534.500 36.585.5 NA 98.900242.4000270.657
2022-01-132338.5035.250 29.951.2 NA 94.600226.5000266.857
2022-01-132337.7535.500 46.799.8 NA 99.900253.2000267.357
2022-01-132338.7535.500100.0 8.7 NA100.000236.0000270.857
2022-01-132338.0035.250 99.276.1 NA100.001230.2000259.557
2022-01-132338.2535.500 99.575.0 NA 99.801267.3000268.257
2022-01-132338.2535.250100.068.7 NA100.001295.2000271.657
2022-01-132337.7535.250 62.986.2 NA 90.700233.2000262.957
2022-01-132338.2535.000 99.947.3 NA100.001268.2000269.357
2022-01-132338.0035.500 87.798.1 NA 99.101255.5000265.857
2022-01-132338.0035.000 99.663.7 NA100.001258.4000263.857
2022-01-132337.7534.750 10.693.3 NA 97.000229.8000265.857
2022-01-132338.5034.500100.0 9.7 NA100.001240.1000267.657
2022-01-132338.5034.750 98.7 9.3 NA100.001236.9000266.957
2022-01-132338.5035.000 99.4 7.0 NA 99.600233.1000267.057
2022-01-132338.5035.500 98.551.7 NA100.001223.6000259.357
2022-01-132338.7534.500 98.0 0.0 NA 98.200238.7000270.457
2022-01-132338.2534.500100.0 5.0 NA100.001249.1000266.257
2022-01-132338.7535.000 95.618.7 NA 96.101242.4000270.357
2022-01-132338.0034.750 96.847.7 NA100.001247.4000265.657
2022-01-132338.7534.750 85.2 2.2 NA 86.100239.3000270.657
2022-01-132338.2534.750100.0 9.8 NA100.001254.9000268.657
2022-01-132338.0034.500 6.356.0 NA100.000221.7000265.157
2022-01-27 338.7534.500 NA NA 39.1100.001230.4870257.784
2022-01-27 338.2535.500 NA NA100.0100.001248.8870258.784
2022-01-27 338.0035.500 NA NA100.0100.000245.9870258.984
2022-01-27 337.7534.750 NA NA100.0100.000224.2870257.284
2022-01-27 338.5034.500 NA NA 79.0100.001224.3870256.084
⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮
2022-03-23 737.7534.50 0.56 9.6 3.5 0.0 NA0.8000205.7270.192268.461
2022-03-23 738.7534.50 0.56 5.7 2.1 0.0 NA0.7360205.1270.144269.361
2022-03-23 738.2534.75 0.7219.140.5 0.0 NA1.2160193.5270.528258.261
2022-03-23 738.5034.50 0.60 6.613.9 0.0 NA1.0240190.8270.432256.561
2022-03-23 738.0034.50 0.6010.340.9 0.0 NA0.8960198.3270.304261.761
2022-04-041038.7535.00544.02 0.0 0.0 97.6 98.3 NA0266.397 NA291.531
2022-04-041037.7535.50579.18 0.0 0.0 72.6 80.7 NA0269.997 NA293.231
2022-04-041037.7535.00615.56 0.0 0.0 5.0 10.8 NA0238.297 NA283.331
2022-04-041037.7535.25614.90 0.0 0.0 20.0 26.5 NA0237.597 NA286.931
2022-04-041038.0035.25616.42 0.0 0.0 5.0 10.5 NA0232.197 NA282.231
2022-04-041038.5035.50612.84 0.0 0.0 9.1 12.2 NA0235.097 NA277.831
2022-04-041038.2534.50590.28 0.0 0.0 10.1 16.6 NA0251.497 NA288.831
2022-04-041038.2535.00591.38 0.0 0.0 33.0 37.2 NA0260.197 NA292.231
2022-04-041038.5035.00587.14 0.0 0.0 48.6 52.8 NA0255.197 NA289.131
2022-04-041038.2535.25588.62 0.0 0.0 6.0 9.9 NA0268.997 NA292.131
2022-04-041038.5034.50581.62 0.0 0.0 37.1 41.2 NA0257.197 NA289.731
2022-04-041038.7535.50546.76 0.0 0.0 85.2 88.5 NA0269.097 NA291.931
2022-04-041038.0035.00603.96 0.0 0.0 5.0 10.8 NA0243.897 NA286.031
2022-04-041038.7534.75555.22 0.0 0.0 92.4 93.6 NA0264.697 NA292.331
2022-04-041038.7534.50548.94 0.0 0.0 99.5 99.6 NA0265.897 NA292.531
2022-04-041038.2535.50592.34 0.0 0.0 5.0 9.2 NA0261.197 NA291.331
2022-04-041037.7534.75601.20 0.0 0.0 7.6 12.4 NA0254.597 NA288.431
2022-04-041038.7535.25531.68 0.0 0.0100.0100.0 NA0270.297 NA291.531
2022-04-041038.2534.75589.52 0.0 0.0 46.7 52.7 NA0258.097 NA292.631
2022-04-041038.5034.75585.10 0.0 0.0 52.9 57.4 NA0253.097 NA289.231
2022-04-041038.5035.25591.58 0.0 0.0 8.7 14.5 NA0255.897 NA290.131
2022-04-041038.0034.75596.30 0.0 0.0 5.0 11.1 NA0252.797 NA288.931
2022-04-041038.0034.50594.70 0.0 0.0 5.7 12.6 NA0251.297 NA288.131
2022-04-041038.0035.50592.68 0.0 0.0 70.4 77.8 NA0256.497 NA290.531
2022-04-041037.7534.50587.90 0.0 0.0 17.8 23.8 NA0269.797 NA294.931
Error in `mutate()`:
ℹ In argument: `across(where(is.numeric), ~na.approx(.x, na.rm =
  FALSE))`.
Caused by error in `across()`:
! Can't compute column `hour`.
Caused by error in `na.approx()`:
! "na.approx" fonksiyonu bulunamadı
Traceback:

1. weather_data %>% mutate(across(where(is.numeric), ~na.approx(.x, 
 .     na.rm = FALSE)))
2. mutate(., across(where(is.numeric), ~na.approx(.x, na.rm = FALSE)))
3. mutate.data.frame(., across(where(is.numeric), ~na.approx(.x, 
 .     na.rm = FALSE)))
4. mutate_cols(.data, dplyr_quosures(...), by)
5. withCallingHandlers(for (i in seq_along(dots)) {
 .     poke_error_context(dots, i, mask = mask)
 .     context_poke("column", old_current_column)
 .     new_columns <- mutate_col(dots[[i]], data, mask, new_columns)
 . }, error = dplyr_error_handler(dots = dots, mask = mask, bullets = mutate_bullets, 
 .     error_call = error_call, error_class = "dplyr:::mutate_error"), 
 .     warning = dplyr_warning_handler(state = warnings_state, mask = mask, 
 .         error_call = error_call))
6. mutate_col(dots[[i]], data, mask, new_columns)
7. withCallingHandlers(mask$eval_all_mutate(quo), error = function(cnd) {
 .     name <- dplyr_quosure_name(quo_data)
 .     msg <- glue("Can't compute column `{name}`.")
 .     abort(msg, call = call("across"), parent = cnd)
 . })
8. mask$eval_all_mutate(quo)
9. eval()
10. .handleSimpleError(function (cnd) 
  . {
  .     name <- dplyr_quosure_name(quo_data)
  .     msg <- glue("Can't compute column `{name}`.")
  .     abort(msg, call = call("across"), parent = cnd)
  . }, "\"na.approx\" fonksiyonu bulunamadı", base::quote(na.approx(hour, 
  .     na.rm = FALSE)))
11. h(simpleError(msg, call))
12. abort(msg, call = call("across"), parent = cnd)
13. signal_abort(cnd, .file)
14. signalCondition(cnd)
15. (function (cnd) 
  . {
  .     local_error_context(dots, i = frame[[i_sym]], mask = mask)
  .     if (inherits(cnd, "dplyr:::internal_error")) {
  .         parent <- error_cnd(message = bullets(cnd))
  .     }
  .     else {
  .         parent <- cnd
  .     }
  .     message <- c(cnd_bullet_header(action), i = if (has_active_group_context(mask)) cnd_bullet_cur_group_label())
  .     abort(message, class = error_class, parent = parent, call = error_call)
  . })(structure(list(message = structure("Can't compute column `hour`.", class = c("glue", 
  . "character")), trace = structure(list(call = list(IRkernel::main(), 
  .     kernel$run(), handle_shell(), executor$execute(msg), tryCatch(evaluate(request$content$code, 
  .         envir = .GlobalEnv, output_handler = oh, stop_on_error = 1L), 
  .         interrupt = function(cond) {
  .             log_debug("Interrupt during execution")
  .             interrupted <<- TRUE
  .         }, error = .self$handle_error), tryCatchList(expr, classes, 
  .         parentenv, handlers), tryCatchOne(tryCatchList(expr, 
  .         names[-nh], parentenv, handlers[-nh]), names[nh], parentenv, 
  .         handlers[[nh]]), doTryCatch(return(expr), name, parentenv, 
  .         handler), tryCatchList(expr, names[-nh], parentenv, handlers[-nh]), 
  .     tryCatchOne(expr, names, parentenv, handlers[[1L]]), doTryCatch(return(expr), 
  .         name, parentenv, handler), evaluate(request$content$code, 
  .         envir = .GlobalEnv, output_handler = oh, stop_on_error = 1L), 
  .     evaluate_call(expr, parsed$src[[i]], envir = envir, enclos = enclos, 
  .         debug = debug, last = i == length(out), use_try = stop_on_error != 
  .             2L, keep_warning = keep_warning, keep_message = keep_message, 
  .         log_echo = log_echo, log_warning = log_warning, output_handler = output_handler, 
  .         include_timing = include_timing), timing_fn(handle(ev <- withCallingHandlers(withVisible(eval_with_user_handlers(expr, 
  .         envir, enclos, user_handlers)), warning = wHandler, error = eHandler, 
  .         message = mHandler))), handle(ev <- withCallingHandlers(withVisible(eval_with_user_handlers(expr, 
  .         envir, enclos, user_handlers)), warning = wHandler, error = eHandler, 
  .         message = mHandler)), try(f, silent = TRUE), tryCatch(expr, 
  .         error = function(e) {
  .             call <- conditionCall(e)
  .             if (!is.null(call)) {
  .                 if (identical(call[[1L]], quote(doTryCatch))) 
  .                   call <- sys.call(-4L)
  .                 dcall <- deparse(call, nlines = 1L)
  .                 prefix <- paste("Error in", dcall, ": ")
  .                 LONG <- 75L
  .                 sm <- strsplit(conditionMessage(e), "\n")[[1L]]
  .                 w <- 14L + nchar(dcall, type = "w") + nchar(sm[1L], 
  .                   type = "w")
  .                 if (is.na(w)) 
  .                   w <- 14L + nchar(dcall, type = "b") + nchar(sm[1L], 
  .                     type = "b")
  .                 if (w > LONG) 
  .                   prefix <- paste0(prefix, "\n  ")
  .             }
  .             else prefix <- "Error : "
  .             msg <- paste0(prefix, conditionMessage(e), "\n")
  .             .Internal(seterrmessage(msg[1L]))
  .             if (!silent && isTRUE(getOption("show.error.messages"))) {
  .                 cat(msg, file = outFile)
  .                 .Internal(printDeferredWarnings())
  .             }
  .             invisible(structure(msg, class = "try-error", condition = e))
  .         }), tryCatchList(expr, classes, parentenv, handlers), 
  .     tryCatchOne(expr, names, parentenv, handlers[[1L]]), doTryCatch(return(expr), 
  .         name, parentenv, handler), withCallingHandlers(withVisible(eval_with_user_handlers(expr, 
  .         envir, enclos, user_handlers)), warning = wHandler, error = eHandler, 
  .         message = mHandler), withVisible(eval_with_user_handlers(expr, 
  .         envir, enclos, user_handlers)), eval_with_user_handlers(expr, 
  .         envir, enclos, user_handlers), eval(expr, envir, enclos), 
  .     eval(expr, envir, enclos), weather_data %>% mutate(across(where(is.numeric), 
  .         ~na.approx(.x, na.rm = FALSE))), mutate(., across(where(is.numeric), 
  .         ~na.approx(.x, na.rm = FALSE))), mutate.data.frame(., 
  .         across(where(is.numeric), ~na.approx(.x, na.rm = FALSE))), 
  .     mutate_cols(.data, dplyr_quosures(...), by), withCallingHandlers(for (i in seq_along(dots)) {
  .         poke_error_context(dots, i, mask = mask)
  .         context_poke("column", old_current_column)
  .         new_columns <- mutate_col(dots[[i]], data, mask, new_columns)
  .     }, error = dplyr_error_handler(dots = dots, mask = mask, 
  .         bullets = mutate_bullets, error_call = error_call, error_class = "dplyr:::mutate_error"), 
  .         warning = dplyr_warning_handler(state = warnings_state, 
  .             mask = mask, error_call = error_call)), mutate_col(dots[[i]], 
  .         data, mask, new_columns), withCallingHandlers(mask$eval_all_mutate(quo), 
  .         error = function(cnd) {
  .             name <- dplyr_quosure_name(quo_data)
  .             msg <- glue("Can't compute column `{name}`.")
  .             abort(msg, call = call("across"), parent = cnd)
  .         }), mask$eval_all_mutate(quo), eval(), .handleSimpleError(`<fn>`, 
  .         "\"na.approx\" fonksiyonu bulunamadı", base::quote(na.approx(hour, 
  .             na.rm = FALSE))), h(simpleError(msg, call)), abort(msg, 
  .         call = call("across"), parent = cnd)), parent = c(0L, 
  . 1L, 2L, 3L, 4L, 5L, 6L, 7L, 6L, 9L, 10L, 4L, 12L, 13L, 13L, 15L, 
  . 16L, 17L, 18L, 19L, 13L, 13L, 13L, 23L, 24L, 0L, 0L, 0L, 28L, 
  . 29L, 29L, 31L, 31L, 33L, 0L, 35L, 36L), visible = c(TRUE, TRUE, 
  . TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
  . TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
  . TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, 
  . FALSE, FALSE), namespace = c("IRkernel", NA, "IRkernel", NA, 
  . "base", "base", "base", "base", "base", "base", "base", "evaluate", 
  . "evaluate", "evaluate", "evaluate", "base", "base", "base", "base", 
  . "base", "base", "base", "evaluate", "base", "base", NA, "dplyr", 
  . "dplyr", "dplyr", "base", "dplyr", "base", NA, "dplyr", "base", 
  . "dplyr", "rlang"), scope = c("::", NA, "local", NA, "::", "local", 
  . "local", "local", "local", "local", "local", "::", ":::", "local", 
  . "local", "::", "::", "local", "local", "local", "::", "::", ":::", 
  . "::", "::", NA, "::", ":::", ":::", "::", ":::", "::", NA, "local", 
  . "::", "local", "::")), row.names = c(NA, -37L), version = 2L, class = c("rlang_trace", 
  . "rlib_trace", "tbl", "data.frame")), parent = structure(list(
  .     message = "\"na.approx\" fonksiyonu bulunamadı", call = na.approx(hour, 
  .         na.rm = FALSE)), class = c("simpleError", "error", "condition"
  . )), rlang = list(inherit = TRUE), call = across(), use_cli_format = TRUE), class = c("rlang_error", 
  . "error", "condition")))
16. abort(message, class = error_class, parent = parent, call = error_call)
17. signal_abort(cnd, .file)
In [6]:
#Coordinate aggregation by long to wide format
long_weather <- weather_data
long_weather <- melt(weather_data,id.vars=c(1:4))
hourly_region_averages = dcast(long_weather, date+hour~variable,fun.aggregate=mean)
View(hourly_region_averages)
Warning message in melt.data.table(weather_data, id.vars = c(1:4)):
“'measure.vars' [dswrf_surface, tcdc_low.cloud.layer, tcdc_middle.cloud.layer, tcdc_high.cloud.layer, ...] are not all of the same type. By order of hierarchy, the molten data value column will be of type 'double'. All measure variables not of type 'double' will be coerced too. Check DETAILS in ?melt.data.table for more on coercion.”
A data.table: 20898 × 12
datehourdswrf_surfacetcdc_low.cloud.layertcdc_middle.cloud.layertcdc_high.cloud.layertcdc_entire.atmosphereuswrf_top_of_atmospherecsnow_surfacedlwrf_surfaceuswrf_surfacetmp_surface
<IDate><int><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
2022-01-01 4 0.0000 2.384 5.944 4.604 14.296 0.000000.00227.9990 0.00000269.220
2022-01-01 5 0.0000 2.784 4.32410.636 19.272 0.000000.00227.7740 0.00000269.104
2022-01-01 6 0.0000 2.964 5.37211.688 21.772 0.000000.00227.7640 0.00000269.035
2022-01-01 7 0.0000 3.284 9.21220.736 31.992 0.000000.00228.1960 0.00000269.001
2022-01-01 8 0.0000 3.67211.25226.432 38.376 0.000000.00228.6570 0.00000269.002
2022-01-01 9 7.3688 4.12010.88035.088 45.856 8.967040.00229.4160 2.41280271.634
2022-01-0110180.1384 4.18013.74880.764 85.364135.204480.00237.2910 59.18848275.786
2022-01-0111254.7928 3.97216.52069.392 75.468152.989440.00237.3870 82.53120278.553
2022-01-0112312.5520 4.40823.46860.516 70.368167.553920.00238.3870 99.73056280.215
2022-01-0113347.4144 5.34832.99654.852 70.452180.693120.00240.8870109.43168280.609
2022-01-0114364.6544 6.77239.46045.880 71.136187.038720.00243.0150113.51040280.277
2022-01-0115362.2984 9.11643.45239.864 71.612188.886400.00245.2960112.10304279.165
2022-01-0116221.458437.10082.29612.680 89.116168.564480.00260.7560 66.06976277.059
2022-01-0117157.644837.03278.25217.376 85.688133.180800.00258.9600 47.22432273.467
2022-01-0118107.308035.87674.75227.016 83.300 92.252160.00257.2560 32.11968271.659
2022-01-0119 80.479235.08072.48037.268 83.508 69.187200.00256.5060 24.08960271.537
2022-01-0120 64.386436.09675.23644.544 85.644 55.349760.00257.7050 19.27104271.715
2022-01-0121 53.652837.02877.63249.756 87.520 46.123520.00259.4850 16.06016271.855
2022-01-0122 0.000049.73290.68868.948 98.276 0.000000.00272.8750 0.00000272.029
2022-01-0123 0.000056.53292.58069.140 98.380 0.000000.00274.7220 0.00000272.231
2022-01-02 0 0.000066.81294.14076.844 98.760 0.000000.04280.7740 0.00000272.513
2022-01-02 1 0.000071.09694.21280.404 98.492 0.000000.20283.7540 0.00000272.616
2022-01-02 2 0.000074.58094.00084.192 98.792 0.000000.32287.3460 0.00000273.024
2022-01-02 3 0.000077.66094.55686.644 98.980 0.000000.48291.0600 0.00000273.426
2022-01-02 4 0.000093.81691.58899.228100.000 0.000000.60312.8040 0.00000273.605
2022-01-02 5 0.000092.82088.04899.388100.000 0.000000.60311.7720 0.00000273.537
2022-01-02 6 0.000093.30489.96099.588100.000 0.000000.68312.2060 0.00000273.710
2022-01-02 7 0.000093.49291.85699.696100.000 0.000000.64312.9950 0.00000273.837
2022-01-02 8 0.000093.52093.40899.756100.000 0.000000.64313.6920 0.00000273.885
2022-01-02 9 1.196893.87294.48899.656100.000 13.378560.64314.2307 0.53568274.263
⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮
2024-05-1916741.68400 3.74812.40013.30826.132184.146560315.8730129.28704300.115
2024-05-1917653.24720 4.83613.92015.87630.608177.169280315.2560119.13024297.226
2024-05-1918551.01840 6.66417.33222.38439.596171.593600315.9100104.58816293.967
2024-05-1919451.50800 7.39217.69233.54048.368157.553280315.3600 87.64096290.504
2024-05-1920365.32160 8.28816.82444.92856.940132.322560314.6400 70.99328287.699
2024-05-1921304.43328 7.72014.72452.29662.416110.266880313.1420 59.15904286.748
2024-05-1922 0.00000 0.932 0.14486.08486.800 0.000000302.1770 0.00000286.008
2024-05-1923 0.00000 0.544 0.14475.57276.660 0.000000300.4080 0.00000285.319
2024-05-20 0 0.00000 0.364 0.30064.07665.364 0.000000298.4437 0.00000284.668
2024-05-20 1 0.00000 0.268 0.30052.52453.820 0.000000296.7120 0.00000284.064
2024-05-20 2 0.00000 0.216 0.30444.52845.840 0.000000295.4120 0.00000283.597
2024-05-20 3 0.00000 0.180 0.46439.21240.740 0.000000294.4750 0.00000283.175
2024-05-20 4 0.00000 0.104 3.63233.51237.208 0.000000289.2320 0.00000282.832
2024-05-20 5 0.00000 0.260 3.59643.00046.432 0.000000289.8500 0.00000282.536
2024-05-20 6 3.35600 0.904 5.81639.87245.740 4.071680290.3950 0.77888283.660
2024-05-20 7 35.98848 1.968 7.48833.31641.208 23.182080290.5790 9.36448288.299
2024-05-20 8 88.90048 3.75211.00029.28441.020 48.746880293.7390 20.81152292.868
2024-05-20 9148.04160 5.27215.51226.25242.420 77.782400297.8070 31.73120296.414
2024-05-2010623.0600016.52034.47612.79645.516239.668800318.9140109.52576300.288
2024-05-2011697.5448010.80832.82811.09643.596243.469600321.0400118.63680303.036
2024-05-2012757.7952010.52430.45611.92042.148243.226880322.7000125.46112304.884
2024-05-2013786.5728012.06829.52413.81641.716251.427840325.2360128.43776305.034
2024-05-2014780.9112014.28431.84017.04445.484270.328960329.1890127.48992303.220
2024-05-2015748.3968017.02835.12022.48852.052294.024960333.3480123.05600300.109
2024-05-2016428.6152034.72857.82449.88892.296423.560960358.2960 79.20320297.543
2024-05-2017367.4000034.49262.11244.06893.800396.170880359.0440 69.55136295.564
2024-05-2018312.2416034.68466.84842.72894.424354.529920358.4140 60.48000293.613
2024-05-2019261.5696031.08066.27649.85293.684303.417600354.7380 51.42720291.316
2024-05-2020212.6873630.61664.84857.58893.680249.778560351.5110 41.86048288.794
2024-05-2021177.2409629.01662.47263.93694.160208.147840347.9070 34.88192288.011
In [7]:
# Merge with hourly_region_averages
template_dt_with_weather <- merge(template_dt, hourly_region_averages, by = c('date', 'hour'), all.x = TRUE)
View(template_dt_with_weather)
#Order it by date and hour
template_dt_with_weather = template_dt_with_weather[order(date,hour)]

template_dt_with_aggregate <- template_dt_with_weather

template_dt_with_aggregate$hourly_cloud_average <- rowMeans(select(template_dt_with_aggregate, starts_with("tcdc_")), na.rm = TRUE)
template_dt_with_aggregate$hourly_max_t <- rowMaxs(as.matrix(select(template_dt_with_aggregate, starts_with("tmp_"))), na.rm = TRUE)
View(template_dt_with_aggregate)
A data.table: 20898 × 13
datehourproductiondswrf_surfacetcdc_low.cloud.layertcdc_middle.cloud.layertcdc_high.cloud.layertcdc_entire.atmosphereuswrf_top_of_atmospherecsnow_surfacedlwrf_surfaceuswrf_surfacetmp_surface
<IDate><int><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
2022-01-01 40.00 0.0000 2.384 5.944 4.604 14.296 0.000000.00227.9990 0.00000269.220
2022-01-01 50.00 0.0000 2.784 4.32410.636 19.272 0.000000.00227.7740 0.00000269.104
2022-01-01 60.00 0.0000 2.964 5.37211.688 21.772 0.000000.00227.7640 0.00000269.035
2022-01-01 70.00 0.0000 3.284 9.21220.736 31.992 0.000000.00228.1960 0.00000269.001
2022-01-01 83.40 0.0000 3.67211.25226.432 38.376 0.000000.00228.6570 0.00000269.002
2022-01-01 96.80 7.3688 4.12010.88035.088 45.856 8.967040.00229.4160 2.41280271.634
2022-01-01109.38180.1384 4.18013.74880.764 85.364135.204480.00237.2910 59.18848275.786
2022-01-01117.65254.7928 3.97216.52069.392 75.468152.989440.00237.3870 82.53120278.553
2022-01-01126.80312.5520 4.40823.46860.516 70.368167.553920.00238.3870 99.73056280.215
2022-01-01135.10347.4144 5.34832.99654.852 70.452180.693120.00240.8870109.43168280.609
2022-01-01145.10364.6544 6.77239.46045.880 71.136187.038720.00243.0150113.51040280.277
2022-01-01151.70362.2984 9.11643.45239.864 71.612188.886400.00245.2960112.10304279.165
2022-01-01160.00221.458437.10082.29612.680 89.116168.564480.00260.7560 66.06976277.059
2022-01-01170.00157.644837.03278.25217.376 85.688133.180800.00258.9600 47.22432273.467
2022-01-01180.00107.308035.87674.75227.016 83.300 92.252160.00257.2560 32.11968271.659
2022-01-01190.00 80.479235.08072.48037.268 83.508 69.187200.00256.5060 24.08960271.537
2022-01-01200.00 64.386436.09675.23644.544 85.644 55.349760.00257.7050 19.27104271.715
2022-01-01210.00 53.652837.02877.63249.756 87.520 46.123520.00259.4850 16.06016271.855
2022-01-01220.00 0.000049.73290.68868.948 98.276 0.000000.00272.8750 0.00000272.029
2022-01-01230.00 0.000056.53292.58069.140 98.380 0.000000.00274.7220 0.00000272.231
2022-01-02 00.00 0.000066.81294.14076.844 98.760 0.000000.04280.7740 0.00000272.513
2022-01-02 10.00 0.000071.09694.21280.404 98.492 0.000000.20283.7540 0.00000272.616
2022-01-02 20.00 0.000074.58094.00084.192 98.792 0.000000.32287.3460 0.00000273.024
2022-01-02 30.00 0.000077.66094.55686.644 98.980 0.000000.48291.0600 0.00000273.426
2022-01-02 40.00 0.000093.81691.58899.228100.000 0.000000.60312.8040 0.00000273.605
2022-01-02 50.00 0.000092.82088.04899.388100.000 0.000000.60311.7720 0.00000273.537
2022-01-02 60.00 0.000093.30489.96099.588100.000 0.000000.68312.2060 0.00000273.710
2022-01-02 70.00 0.000093.49291.85699.696100.000 0.000000.64312.9950 0.00000273.837
2022-01-02 80.00 0.000093.52093.40899.756100.000 0.000000.64313.6920 0.00000273.885
2022-01-02 90.85 1.196893.87294.48899.656100.000 13.378560.64314.2307 0.53568274.263
⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮
2024-05-1916NA741.68400 3.74812.40013.30826.132184.146560315.8730129.28704300.115
2024-05-1917NA653.24720 4.83613.92015.87630.608177.169280315.2560119.13024297.226
2024-05-1918NA551.01840 6.66417.33222.38439.596171.593600315.9100104.58816293.967
2024-05-1919NA451.50800 7.39217.69233.54048.368157.553280315.3600 87.64096290.504
2024-05-1920NA365.32160 8.28816.82444.92856.940132.322560314.6400 70.99328287.699
2024-05-1921NA304.43328 7.72014.72452.29662.416110.266880313.1420 59.15904286.748
2024-05-1922NA 0.00000 0.932 0.14486.08486.800 0.000000302.1770 0.00000286.008
2024-05-1923NA 0.00000 0.544 0.14475.57276.660 0.000000300.4080 0.00000285.319
2024-05-20 0NA 0.00000 0.364 0.30064.07665.364 0.000000298.4437 0.00000284.668
2024-05-20 1NA 0.00000 0.268 0.30052.52453.820 0.000000296.7120 0.00000284.064
2024-05-20 2NA 0.00000 0.216 0.30444.52845.840 0.000000295.4120 0.00000283.597
2024-05-20 3NA 0.00000 0.180 0.46439.21240.740 0.000000294.4750 0.00000283.175
2024-05-20 4NA 0.00000 0.104 3.63233.51237.208 0.000000289.2320 0.00000282.832
2024-05-20 5NA 0.00000 0.260 3.59643.00046.432 0.000000289.8500 0.00000282.536
2024-05-20 6NA 3.35600 0.904 5.81639.87245.740 4.071680290.3950 0.77888283.660
2024-05-20 7NA 35.98848 1.968 7.48833.31641.208 23.182080290.5790 9.36448288.299
2024-05-20 8NA 88.90048 3.75211.00029.28441.020 48.746880293.7390 20.81152292.868
2024-05-20 9NA148.04160 5.27215.51226.25242.420 77.782400297.8070 31.73120296.414
2024-05-2010NA623.0600016.52034.47612.79645.516239.668800318.9140109.52576300.288
2024-05-2011NA697.5448010.80832.82811.09643.596243.469600321.0400118.63680303.036
2024-05-2012NA757.7952010.52430.45611.92042.148243.226880322.7000125.46112304.884
2024-05-2013NA786.5728012.06829.52413.81641.716251.427840325.2360128.43776305.034
2024-05-2014NA780.9112014.28431.84017.04445.484270.328960329.1890127.48992303.220
2024-05-2015NA748.3968017.02835.12022.48852.052294.024960333.3480123.05600300.109
2024-05-2016NA428.6152034.72857.82449.88892.296423.560960358.2960 79.20320297.543
2024-05-2017NA367.4000034.49262.11244.06893.800396.170880359.0440 69.55136295.564
2024-05-2018NA312.2416034.68466.84842.72894.424354.529920358.4140 60.48000293.613
2024-05-2019NA261.5696031.08066.27649.85293.684303.417600354.7380 51.42720291.316
2024-05-2020NA212.6873630.61664.84857.58893.680249.778560351.5110 41.86048288.794
2024-05-2021NA177.2409629.01662.47263.93694.160208.147840347.9070 34.88192288.011
A data.table: 20898 × 15
datehourproductiondswrf_surfacetcdc_low.cloud.layertcdc_middle.cloud.layertcdc_high.cloud.layertcdc_entire.atmosphereuswrf_top_of_atmospherecsnow_surfacedlwrf_surfaceuswrf_surfacetmp_surfacehourly_cloud_averagehourly_max_t
<IDate><int><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
2022-01-01 40.00 0.0000 2.384 5.944 4.604 14.296 0.000000.00227.9990 0.00000269.220 6.807269.220
2022-01-01 50.00 0.0000 2.784 4.32410.636 19.272 0.000000.00227.7740 0.00000269.104 9.254269.104
2022-01-01 60.00 0.0000 2.964 5.37211.688 21.772 0.000000.00227.7640 0.00000269.03510.449269.035
2022-01-01 70.00 0.0000 3.284 9.21220.736 31.992 0.000000.00228.1960 0.00000269.00116.306269.001
2022-01-01 83.40 0.0000 3.67211.25226.432 38.376 0.000000.00228.6570 0.00000269.00219.933269.002
2022-01-01 96.80 7.3688 4.12010.88035.088 45.856 8.967040.00229.4160 2.41280271.63423.986271.634
2022-01-01109.38180.1384 4.18013.74880.764 85.364135.204480.00237.2910 59.18848275.78646.014275.786
2022-01-01117.65254.7928 3.97216.52069.392 75.468152.989440.00237.3870 82.53120278.55341.338278.553
2022-01-01126.80312.5520 4.40823.46860.516 70.368167.553920.00238.3870 99.73056280.21539.690280.215
2022-01-01135.10347.4144 5.34832.99654.852 70.452180.693120.00240.8870109.43168280.60940.912280.609
2022-01-01145.10364.6544 6.77239.46045.880 71.136187.038720.00243.0150113.51040280.27740.812280.277
2022-01-01151.70362.2984 9.11643.45239.864 71.612188.886400.00245.2960112.10304279.16541.011279.165
2022-01-01160.00221.458437.10082.29612.680 89.116168.564480.00260.7560 66.06976277.05955.298277.059
2022-01-01170.00157.644837.03278.25217.376 85.688133.180800.00258.9600 47.22432273.46754.587273.467
2022-01-01180.00107.308035.87674.75227.016 83.300 92.252160.00257.2560 32.11968271.65955.236271.659
2022-01-01190.00 80.479235.08072.48037.268 83.508 69.187200.00256.5060 24.08960271.53757.084271.537
2022-01-01200.00 64.386436.09675.23644.544 85.644 55.349760.00257.7050 19.27104271.71560.380271.715
2022-01-01210.00 53.652837.02877.63249.756 87.520 46.123520.00259.4850 16.06016271.85562.984271.855
2022-01-01220.00 0.000049.73290.68868.948 98.276 0.000000.00272.8750 0.00000272.02976.911272.029
2022-01-01230.00 0.000056.53292.58069.140 98.380 0.000000.00274.7220 0.00000272.23179.158272.231
2022-01-02 00.00 0.000066.81294.14076.844 98.760 0.000000.04280.7740 0.00000272.51384.139272.513
2022-01-02 10.00 0.000071.09694.21280.404 98.492 0.000000.20283.7540 0.00000272.61686.051272.616
2022-01-02 20.00 0.000074.58094.00084.192 98.792 0.000000.32287.3460 0.00000273.02487.891273.024
2022-01-02 30.00 0.000077.66094.55686.644 98.980 0.000000.48291.0600 0.00000273.42689.460273.426
2022-01-02 40.00 0.000093.81691.58899.228100.000 0.000000.60312.8040 0.00000273.60596.158273.605
2022-01-02 50.00 0.000092.82088.04899.388100.000 0.000000.60311.7720 0.00000273.53795.064273.537
2022-01-02 60.00 0.000093.30489.96099.588100.000 0.000000.68312.2060 0.00000273.71095.713273.710
2022-01-02 70.00 0.000093.49291.85699.696100.000 0.000000.64312.9950 0.00000273.83796.261273.837
2022-01-02 80.00 0.000093.52093.40899.756100.000 0.000000.64313.6920 0.00000273.88596.671273.885
2022-01-02 90.85 1.196893.87294.48899.656100.000 13.378560.64314.2307 0.53568274.26397.004274.263
⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮
2024-05-1916NA741.68400 3.74812.40013.30826.132184.146560315.8730129.28704300.11513.897300.115
2024-05-1917NA653.24720 4.83613.92015.87630.608177.169280315.2560119.13024297.22616.310297.226
2024-05-1918NA551.01840 6.66417.33222.38439.596171.593600315.9100104.58816293.96721.494293.967
2024-05-1919NA451.50800 7.39217.69233.54048.368157.553280315.3600 87.64096290.50426.748290.504
2024-05-1920NA365.32160 8.28816.82444.92856.940132.322560314.6400 70.99328287.69931.745287.699
2024-05-1921NA304.43328 7.72014.72452.29662.416110.266880313.1420 59.15904286.74834.289286.748
2024-05-1922NA 0.00000 0.932 0.14486.08486.800 0.000000302.1770 0.00000286.00843.490286.008
2024-05-1923NA 0.00000 0.544 0.14475.57276.660 0.000000300.4080 0.00000285.31938.230285.319
2024-05-20 0NA 0.00000 0.364 0.30064.07665.364 0.000000298.4437 0.00000284.66832.526284.668
2024-05-20 1NA 0.00000 0.268 0.30052.52453.820 0.000000296.7120 0.00000284.06426.728284.064
2024-05-20 2NA 0.00000 0.216 0.30444.52845.840 0.000000295.4120 0.00000283.59722.722283.597
2024-05-20 3NA 0.00000 0.180 0.46439.21240.740 0.000000294.4750 0.00000283.17520.149283.175
2024-05-20 4NA 0.00000 0.104 3.63233.51237.208 0.000000289.2320 0.00000282.83218.614282.832
2024-05-20 5NA 0.00000 0.260 3.59643.00046.432 0.000000289.8500 0.00000282.53623.322282.536
2024-05-20 6NA 3.35600 0.904 5.81639.87245.740 4.071680290.3950 0.77888283.66023.083283.660
2024-05-20 7NA 35.98848 1.968 7.48833.31641.208 23.182080290.5790 9.36448288.29920.995288.299
2024-05-20 8NA 88.90048 3.75211.00029.28441.020 48.746880293.7390 20.81152292.86821.264292.868
2024-05-20 9NA148.04160 5.27215.51226.25242.420 77.782400297.8070 31.73120296.41422.364296.414
2024-05-2010NA623.0600016.52034.47612.79645.516239.668800318.9140109.52576300.28827.327300.288
2024-05-2011NA697.5448010.80832.82811.09643.596243.469600321.0400118.63680303.03624.582303.036
2024-05-2012NA757.7952010.52430.45611.92042.148243.226880322.7000125.46112304.88423.762304.884
2024-05-2013NA786.5728012.06829.52413.81641.716251.427840325.2360128.43776305.03424.281305.034
2024-05-2014NA780.9112014.28431.84017.04445.484270.328960329.1890127.48992303.22027.163303.220
2024-05-2015NA748.3968017.02835.12022.48852.052294.024960333.3480123.05600300.10931.672300.109
2024-05-2016NA428.6152034.72857.82449.88892.296423.560960358.2960 79.20320297.54358.684297.543
2024-05-2017NA367.4000034.49262.11244.06893.800396.170880359.0440 69.55136295.56458.618295.564
2024-05-2018NA312.2416034.68466.84842.72894.424354.529920358.4140 60.48000293.61359.671293.613
2024-05-2019NA261.5696031.08066.27649.85293.684303.417600354.7380 51.42720291.31660.223291.316
2024-05-2020NA212.6873630.61664.84857.58893.680249.778560351.5110 41.86048288.79461.683288.794
2024-05-2021NA177.2409629.01662.47263.93694.160208.147840347.9070 34.88192288.01162.396288.011
In [8]:
# Use select to exclude coumns starting with "tcdc_" to focus on average
template_dt_with_aggregate <- template_dt_with_aggregate %>%
  select(-starts_with("tcdc_"))
#-starts_with("tmp_"))

# Read your holiday dataset
#holiday_data <- read.csv("/Users/ecemozturk/Desktop/Short Holiday.csv")
holiday_data <- read_excel("/Users/ecemozturk/Desktop/Short Holiday12.xlsx")
# Convert date column to Date format
holiday_data$date <- as.Date(holiday_data$date, format = "%d.%m.%Y", na.rm = TRUE)

# Merge holiday data with production dataset based on the date column
all_data <- merge(template_dt_with_aggregate, holiday_data, by = "date", all.x = TRUE)


all_data = all_data[!is.na(production)]
all_data_daily <- all_data[all_data$date == day_before_tday, ]

available_data = all_data[!is.na(production) & hour >= 4 & hour <= 19,]
#to_be_forecasted = template_dt_with_weather[is.na(production)]
to_be_forecasted <- all_data[is.na(production) & hour >= 4 & hour <= 19, ]

View(all_data)
A data.table: 20780 × 16
datehourproductiondswrf_surfaceuswrf_top_of_atmospherecsnow_surfacedlwrf_surfaceuswrf_surfacetmp_surfacehourly_cloud_averagehourly_max_tis.weekendis.ramadanis.religousdayis.nationaldayis.publicholiday
<IDate><int><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
2022-01-01 40.00 0.0000 0.000000.00227.9990 0.00000269.220 6.807269.22010011
2022-01-01 50.00 0.0000 0.000000.00227.7740 0.00000269.104 9.254269.10410011
2022-01-01 60.00 0.0000 0.000000.00227.7640 0.00000269.03510.449269.03510011
2022-01-01 70.00 0.0000 0.000000.00228.1960 0.00000269.00116.306269.00110011
2022-01-01 83.40 0.0000 0.000000.00228.6570 0.00000269.00219.933269.00210011
2022-01-01 96.80 7.3688 8.967040.00229.4160 2.41280271.63423.986271.63410011
2022-01-01109.38180.1384135.204480.00237.2910 59.18848275.78646.014275.78610011
2022-01-01117.65254.7928152.989440.00237.3870 82.53120278.55341.338278.55310011
2022-01-01126.80312.5520167.553920.00238.3870 99.73056280.21539.690280.21510011
2022-01-01135.10347.4144180.693120.00240.8870109.43168280.60940.912280.60910011
2022-01-01145.10364.6544187.038720.00243.0150113.51040280.27740.812280.27710011
2022-01-01151.70362.2984188.886400.00245.2960112.10304279.16541.011279.16510011
2022-01-01160.00221.4584168.564480.00260.7560 66.06976277.05955.298277.05910011
2022-01-01170.00157.6448133.180800.00258.9600 47.22432273.46754.587273.46710011
2022-01-01180.00107.3080 92.252160.00257.2560 32.11968271.65955.236271.65910011
2022-01-01190.00 80.4792 69.187200.00256.5060 24.08960271.53757.084271.53710011
2022-01-01200.00 64.3864 55.349760.00257.7050 19.27104271.71560.380271.71510011
2022-01-01210.00 53.6528 46.123520.00259.4850 16.06016271.85562.984271.85510011
2022-01-01220.00 0.0000 0.000000.00272.8750 0.00000272.02976.911272.02910011
2022-01-01230.00 0.0000 0.000000.00274.7220 0.00000272.23179.158272.23110011
2022-01-02 00.00 0.0000 0.000000.04280.7740 0.00000272.51384.139272.51310001
2022-01-02 10.00 0.0000 0.000000.20283.7540 0.00000272.61686.051272.61610001
2022-01-02 20.00 0.0000 0.000000.32287.3460 0.00000273.02487.891273.02410001
2022-01-02 30.00 0.0000 0.000000.48291.0600 0.00000273.42689.460273.42610001
2022-01-02 40.00 0.0000 0.000000.60312.8040 0.00000273.60596.158273.60510001
2022-01-02 50.00 0.0000 0.000000.60311.7720 0.00000273.53795.064273.53710001
2022-01-02 60.00 0.0000 0.000000.68312.2060 0.00000273.71095.713273.71010001
2022-01-02 70.00 0.0000 0.000000.64312.9950 0.00000273.83796.261273.83710001
2022-01-02 80.00 0.0000 0.000000.64313.6920 0.00000273.88596.671273.88510001
2022-01-02 90.85 1.1968 13.378560.64314.2307 0.53568274.26397.004274.26310001
⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮
2024-05-14180.14568.46960169.699200264.016110.46400286.981010.242286.981000000
2024-05-14190.00469.56800150.267520261.975 94.29376283.1640 9.776283.164000000
2024-05-14200.00379.66336124.106240259.407 76.52352279.9370 8.808279.937000000
2024-05-14210.00316.38528103.420800257.020 63.77152279.0600 8.659279.060000000
2024-05-14220.00 0.00000 0.000000243.652 0.00000278.387014.836278.387000000
2024-05-14230.00 0.00000 0.000000242.829 0.00000277.755015.197277.755000000
2024-05-15 00.00 0.00000 0.000000241.878 0.00000277.144014.826277.144000000
2024-05-15 10.00 0.00000 0.000000241.984 0.00000276.810018.349276.810000000
2024-05-15 20.00 0.00000 0.000000243.318 0.00000276.690024.441276.690000000
2024-05-15 30.00 0.00000 0.000000246.041 0.00000276.952030.521276.952000000
2024-05-15 40.08 0.00000 0.000000265.722 0.00000276.960062.846276.960000000
2024-05-15 50.84 0.00000 0.000000268.502 0.00000276.987064.873276.987000000
2024-05-15 62.94 2.19840 4.119680270.013 0.44928277.985066.249277.985000000
2024-05-15 76.58 23.85280 32.986880271.004 4.95872281.337066.061281.337000000
2024-05-15 89.13 70.55808 64.856320271.370 14.99968285.493065.294285.493000000
2024-05-15 99.82133.17440 93.235840272.017 27.34272290.060064.018290.060000000
2024-05-15109.87671.21840220.832640271.943122.53632295.173043.818295.173000000
2024-05-15119.82770.01600205.898240270.559133.23072299.452033.187299.452000000
2024-05-15127.92837.92160199.863680271.397139.62432301.116024.561301.116000000
2024-05-15137.18877.82480199.393920273.757143.27296301.077021.357301.077000000
2024-05-15146.00891.83360201.669120276.612144.44480300.165620.904300.165600000
2024-05-15153.11880.27072207.318400280.244143.31904298.016023.678298.016000000
2024-05-15160.00645.33440259.095040305.592117.19168295.492053.864295.492000000
2024-05-15170.00563.07520245.660800305.315105.66080292.580054.906292.580000000
2024-05-15180.00479.35600223.520640304.388 92.77632289.556052.225289.556000000
2024-05-15190.00394.93824195.495040302.328 77.94688286.233050.264286.233000000
2024-05-15200.00319.39200161.115520301.056 63.09376283.785049.482283.785000000
2024-05-15210.00266.16064134.261760300.140 52.57856283.200048.427283.200000000
2024-05-15220.00 0.00000 0.000000290.460 0.00000282.466038.033282.466000000
2024-05-15230.00 0.00000 0.000000286.686 0.00000281.743034.667281.743000000
In [ ]:

In [9]:
# Plot production data as a line graph
plot(all_data$date, all_data$production, type = "l", xlab = "Date", ylab = "Production", main = "Production Data")

Linear regression model:

To make a prediction by using linear regression, our first attempt was to try to create an aggregate linear regression model by using the parameters defined above. In addition to these parameters, once the aggregate production plot was analyzed, one can clearly detect a shift in the production on the period approximately in between 01-06-2022 and 16-11-2022. To eliminate its effect in our model, we have defined special_period parameter as indicated below. Lastly, we can see a clear pattern where the production reaches up to capacity level of approximately 10 and does not go beyond that level.

Since each hour has different production levels and its parameters take different values, we have created hourly-linear regressions to make forecasts for each hour seperately and eliminate hourly seasonality. We have prepared linear regression models for hours in between 4 & 18 since there is usually no production at hours 0,1,2,3 and 19,20,21,22,23. We have started our analysis by preparing a lm model including all the parameters introduced. After checking the summary and ACF plots of each hour, we have decided to include the ones with large significance and discard the insignificant ones from our model. For each hour, the same assumption was followed.

After doing that, we have introduced another variable to capture the effect of lags (indicated as lag_1_production). In most cases, lag time was selected as 1.

In [97]:
# Convert wday and month to characters
all_data[, wday := as.character(wday(date, label = TRUE))]
all_data[, month := as.character(month(date, label = TRUE))]

#special period was defined to represent the period when the capacity reaches up to 12.
all_data$special_period <- as.numeric(all_data$date >= as.Date("2022-06-01") & all_data$date <= as.Date("2022-11-16"))

#Linear regression for cummulative production
lm_model <- lm(production ~.+hourly_max_t*month+wday  -date, data = all_data)
summary(lm_model)
Call:
lm(formula = production ~ . + hourly_max_t * month + wday - date, 
    data = all_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.2566 -1.3381 -0.2832  1.1297 10.1386 

Coefficients: (2 not defined because of singularities)
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)             -1.183e+02  1.806e+00 -65.491  < 2e-16 ***
hour                    -7.434e-02  2.426e-03 -30.643  < 2e-16 ***
dswrf_surface           -8.027e-03  2.250e-04 -35.682  < 2e-16 ***
uswrf_top_of_atmosphere  6.639e-03  2.504e-04  26.511  < 2e-16 ***
csnow_surface            2.107e+00  1.078e-01  19.540  < 2e-16 ***
dlwrf_surface           -8.416e-02  9.777e-04 -86.087  < 2e-16 ***
uswrf_surface            1.251e-02  6.585e-04  19.005  < 2e-16 ***
tmp_surface              5.074e-01  6.415e-03  79.098  < 2e-16 ***
hourly_cloud_average     2.601e-02  9.883e-04  26.316  < 2e-16 ***
hourly_max_t                    NA         NA      NA       NA    
is.weekend               4.192e-01  2.437e-01   1.720 0.085457 .  
is.ramadan              -3.223e-01  6.984e-02  -4.615 3.96e-06 ***
is.religousday           1.978e-01  9.355e-02   2.114 0.034518 *  
is.nationalday           2.637e-01  1.608e-01   1.640 0.101032    
is.publicholiday        -6.239e-01  2.425e-01  -2.573 0.010102 *  
wdayCum                 -1.783e-01  5.583e-02  -3.193 0.001409 ** 
wdayÇar                 -1.642e-01  5.564e-02  -2.950 0.003180 ** 
wdayPaz                 -1.141e-01  5.577e-02  -2.046 0.040762 *  
wdayPer                 -2.192e-01  5.578e-02  -3.929 8.54e-05 ***
wdayPts                 -1.291e-01  5.565e-02  -2.320 0.020357 *  
wdaySal                         NA         NA      NA       NA    
monthAra                -7.767e+00  3.301e+00  -2.353 0.018632 *  
monthEki                 1.333e-01  2.530e+00   0.053 0.957989    
monthEyl                -4.919e-01  2.140e+00  -0.230 0.818168    
monthHaz                -3.846e+01  2.665e+00 -14.432  < 2e-16 ***
monthKas                 4.319e+00  2.868e+00   1.506 0.132159    
monthMar                 9.223e+00  2.223e+00   4.149 3.36e-05 ***
monthMay                -2.338e+01  2.367e+00  -9.876  < 2e-16 ***
monthNis                -8.559e+00  2.200e+00  -3.891 0.000100 ***
monthOca                 7.182e+00  2.500e+00   2.873 0.004070 ** 
monthŞub                 2.010e+01  2.354e+00   8.537  < 2e-16 ***
monthTem                -1.940e+01  2.222e+00  -8.731  < 2e-16 ***
special_period           2.101e-01  4.691e-02   4.479 7.54e-06 ***
hourly_max_t:monthAra    3.382e-02  1.174e-02   2.881 0.003962 ** 
hourly_max_t:monthEki    5.904e-04  8.679e-03   0.068 0.945771    
hourly_max_t:monthEyl    1.397e-03  7.209e-03   0.194 0.846370    
hourly_max_t:monthHaz    1.356e-01  9.053e-03  14.979  < 2e-16 ***
hourly_max_t:monthKas   -1.199e-02  1.002e-02  -1.197 0.231334    
hourly_max_t:monthMar   -2.815e-02  7.718e-03  -3.647 0.000266 ***
hourly_max_t:monthMay    8.451e-02  8.114e-03  10.415  < 2e-16 ***
hourly_max_t:monthNis    3.230e-02  7.521e-03   4.294 1.76e-05 ***
hourly_max_t:monthOca   -2.041e-02  8.828e-03  -2.312 0.020798 *  
hourly_max_t:monthŞub   -6.574e-02  8.272e-03  -7.948 1.99e-15 ***
hourly_max_t:monthTem    6.668e-02  7.466e-03   8.931  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.144 on 20735 degrees of freedom
  (3 observations deleted due to missingness)
Multiple R-squared:  0.664,	Adjusted R-squared:  0.6633 
F-statistic: 999.3 on 41 and 20735 DF,  p-value: < 2.2e-16
In [98]:
available_data <- all_data[!is.na(production) & hour >= 4 & hour <= 18 & date < start_date,]
to_be_forecasted <- all_data[!is.na(production) & hour >= 4 & hour <= 18 & date >= start_date & date <= end_date, ]

library(dplyr)

Once we plot hour_4_data, we observe limited amount of data mainly because the sun usually does not rise at that time. Therefore, our regression value couldn't captured the trend and our predictions were significantly deviated from the actual values , as expected.

In [173]:
# Filter data for hour 4
hour_4_data <- all_data[all_data$hour == 4, ]
hour_4_data$trend_hour_4 <- 1:nrow(hour_4_data)
hour_4_data[, lag_1_production := shift(production,1)]
hour_4_data[,lag_1_diff:=production-lag_1_production]
hour_4_data <- hour_4_data[!is.na(lag_1_production)]
# Fit linear regression model for hour 4
lm_hour_4 <- lm(production ~+lag_1_production +dlwrf_surface+tmp_surface+hourly_cloud_average+special_period+trend_hour_4+month, data = hour_4_data)
# Summarize the model
summary(lm_hour_4)
checkresiduals(lm_hour_4)
Call:
lm(formula = production ~ +lag_1_production + dlwrf_surface + 
    tmp_surface + hourly_cloud_average + special_period + trend_hour_4 + 
    month, data = hour_4_data)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.121351 -0.001939 -0.000402  0.000778  0.173824 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)           5.239e-02  5.513e-02   0.950  0.34219    
lag_1_production      7.021e-01  2.467e-02  28.459  < 2e-16 ***
dlwrf_surface         1.859e-05  3.902e-05   0.476  0.63393    
tmp_surface          -1.980e-04  2.282e-04  -0.868  0.38588    
hourly_cloud_average -1.778e-05  2.921e-05  -0.609  0.54286    
special_period       -5.201e-03  1.516e-03  -3.431  0.00063 ***
trend_hour_4          3.035e-06  2.114e-06   1.436  0.15137    
monthAra             -4.042e-03  3.027e-03  -1.335  0.18217    
monthEki             -5.109e-04  2.435e-03  -0.210  0.83385    
monthEyl             -2.670e-04  2.246e-03  -0.119  0.90539    
monthHaz              1.294e-02  2.430e-03   5.325 1.29e-07 ***
monthKas             -2.199e-03  2.750e-03  -0.800  0.42413    
monthMar             -3.781e-03  3.050e-03  -1.239  0.21550    
monthMay             -7.163e-04  2.485e-03  -0.288  0.77320    
monthNis             -3.008e-03  2.612e-03  -1.152  0.24980    
monthOca             -3.981e-03  3.234e-03  -1.231  0.21862    
monthŞub             -4.200e-03  3.328e-03  -1.262  0.20728    
monthTem              5.044e-03  2.262e-03   2.230  0.02601 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.0119 on 847 degrees of freedom
Multiple R-squared:  0.6806,	Adjusted R-squared:  0.6742 
F-statistic: 106.2 on 17 and 847 DF,  p-value: < 2.2e-16
	Breusch-Godfrey test for serial correlation of order up to 21

data:  Residuals
LM test = 415.11, df = 21, p-value < 2.2e-16
In [177]:
#Production data for hour 4
hour_4_data <- all_data[all_data$hour == 4, ]

# Plot production for hour 4
ggplot(hour_4_data, aes(x = date, y = production)) +
  geom_line() +
  labs(title = "Hourly Production Data for Hour 4",
       x = "Date",
       y = "Production") +
  theme_minimal()
In [175]:
hour_4_data <- all_data %>%
  filter(hour == 4, date >= start_date & date <= end_date)

forecast_data_hour_4 <- to_be_forecasted %>%
  filter(hour == 4)
forecast_data_hour_4$trend_hour_4 <- nrow(hour_4_data) + 1:nrow(forecast_data_hour_4)

last_production_value <- tail(hour_4_data$production, 1)
forecast_data_hour_4 <- forecast_data_hour_4 %>%
  mutate(lag_1_production = last_production_value)

predictions_hour_4 <- predict(lm_hour_4, newdata = forecast_data_hour_4)
predictions_hour_4[predictions_hour_4 < 0] <- 0

predictions_hour_4_df <- data.frame(
  date = forecast_data_hour_4$date,  
  predicted = predictions_hour_4
)

merged_data <- merge(hour_4_data, predictions_hour_4_df, by = "date")

output_data <- merged_data %>%
  select(date, actual = production, predicted)

output_data <- output_data %>%
  mutate(error = abs(predicted - actual))

wmape <- sum(output_data$error) / sum(output_data$actual) * 100

print(paste("WMAPE:", wmape, "%"))

output_dir <- "C:/Users/ecemozturk/Desktop"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

file_path <- file.path(output_dir, "predictions_hour_4_with_actuals.xlsx")

write_xlsx(output_data, file_path)

print(output_data)
[1] "WMAPE: 1407.11310958475 %"
Key: <date>
           date actual  predicted       error
         <IDat>  <num>      <num>       <num>
  1: 2024-02-01   0.00 0.05524756 0.055247562
  2: 2024-02-02   0.00 0.05539115 0.055391154
  3: 2024-02-03   0.00 0.05517195 0.055171948
  4: 2024-02-04   0.00 0.05462935 0.054629350
  5: 2024-02-05   0.00 0.05435240 0.054352396
 ---                                         
101: 2024-05-11   0.03 0.05726587 0.027265872
102: 2024-05-12   0.00 0.05773654 0.057736539
103: 2024-05-13   0.06 0.05737257 0.002627426
104: 2024-05-14   0.07 0.05818364 0.011816364
105: 2024-05-15   0.08 0.05746433 0.022535670

Similar to hour 5, our model also couldn't improved in lm_hour_5.

In [171]:
# Filter data for hour 5
hour_5_data <- all_data[all_data$hour == 5, ]
hour_5_data <- hour_5_data[,-c(2)]
hour_5_data[, lag_1_production := shift(production,1)]
hour_5_data[,lag_1_diff:=production-lag_1_production]
hour_5_data <- hour_5_data[!is.na(lag_1_production)]
hour_5_data$trend_hour_5 <- 1:nrow(hour_5_data)
# Fit linear regression model for hour 5
lm_hour_5 <- lm(production ~+lag_1_production+dlwrf_surface+is.ramadan+special_period+trend_hour_5 +month*hourly_max_t, data = hour_5_data)
# Summarize the model
summary(lm_hour_5)
checkresiduals(lm_hour_5)
plot(lm_hour_5)
Call:
lm(formula = production ~ +lag_1_production + dlwrf_surface + 
    is.ramadan + special_period + trend_hour_5 + month * hourly_max_t, 
    data = hour_5_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.97514 -0.03869 -0.00864  0.02658  1.76482 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)           -1.507e+00  2.805e+00  -0.537   0.5912    
lag_1_production       5.048e-01  3.000e-02  16.826  < 2e-16 ***
dlwrf_surface          3.991e-04  2.609e-04   1.530   0.1265    
is.ramadan            -4.754e-02  2.091e-02  -2.273   0.0233 *  
special_period        -7.987e-02  1.669e-02  -4.785 2.02e-06 ***
trend_hour_5           1.222e-04  2.369e-05   5.157 3.13e-07 ***
monthAra               2.776e+00  3.181e+00   0.873   0.3830    
monthEki               3.291e+00  3.214e+00   1.024   0.3061    
monthEyl               2.167e+00  3.253e+00   0.666   0.5054    
monthHaz              -5.195e+00  4.423e+00  -1.175   0.2405    
monthKas               3.170e+00  3.191e+00   0.994   0.3207    
monthMar               2.567e+00  2.860e+00   0.898   0.3697    
monthMay               4.199e+00  3.186e+00   1.318   0.1879    
monthNis              -8.816e-01  3.015e+00  -0.292   0.7701    
monthOca               3.002e+00  2.879e+00   1.043   0.2974    
monthŞub               2.648e+00  2.868e+00   0.923   0.3561    
monthTem              -2.084e+00  3.595e+00  -0.580   0.5623    
hourly_max_t           5.028e-03  9.818e-03   0.512   0.6087    
monthAra:hourly_max_t -1.029e-02  1.122e-02  -0.917   0.3595    
monthEki:hourly_max_t -1.184e-02  1.127e-02  -1.051   0.2936    
monthEyl:hourly_max_t -7.717e-03  1.138e-02  -0.678   0.4977    
monthHaz:hourly_max_t  1.857e-02  1.551e-02   1.197   0.2315    
monthKas:hourly_max_t -1.158e-02  1.123e-02  -1.031   0.3027    
monthMar:hourly_max_t -9.429e-03  1.000e-02  -0.943   0.3460    
monthMay:hourly_max_t -1.502e-02  1.117e-02  -1.345   0.1791    
monthNis:hourly_max_t  3.273e-03  1.055e-02   0.310   0.7565    
monthOca:hourly_max_t -1.111e-02  1.008e-02  -1.103   0.2706    
monthŞub:hourly_max_t -9.825e-03  1.004e-02  -0.979   0.3280    
monthTem:hourly_max_t  7.583e-03  1.256e-02   0.604   0.5463    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.1276 on 836 degrees of freedom
Multiple R-squared:  0.6002,	Adjusted R-squared:  0.5868 
F-statistic: 44.83 on 28 and 836 DF,  p-value: < 2.2e-16
	Breusch-Godfrey test for serial correlation of order up to 32

data:  Residuals
LM test = 247.14, df = 32, p-value < 2.2e-16
In [178]:
#Production data for hour 5
hour_5_data <- all_data[all_data$hour == 5, ]

# Plot production for hour 4
ggplot(hour_5_data, aes(x = date, y = production)) +
  geom_line() +
  labs(title = "Hourly Production Data for Hour 5",
       x = "Date",
       y = "Production") +
  theme_minimal()
In [174]:
hour_5_data <- all_data %>%
  filter(hour == 5, date >= start_date & date <= end_date)

forecast_data_hour_5 <- to_be_forecasted %>%
  filter(hour == 5)
forecast_data_hour_5$trend_hour_5 <- nrow(hour_5_data) + 1:nrow(forecast_data_hour_5)

last_production_value <- tail(hour_5_data$production, 1)
forecast_data_hour_5 <- forecast_data_hour_5 %>%
  mutate(lag_1_production = last_production_value)

predictions_hour_5 <- predict(lm_hour_5, newdata = forecast_data_hour_5)
predictions_hour_5[predictions_hour_5 < 0] <- 0

predictions_hour_5_df <- data.frame(
  date = forecast_data_hour_5$date,  
  predicted = predictions_hour_5
)

merged_data <- merge(hour_5_data, predictions_hour_5_df, by = "date")

output_data <- merged_data %>%
  select(date, actual = production, predicted)

output_data <- output_data %>%
  mutate(error = abs(predicted - actual))

wmape <- sum(output_data$error) / sum(output_data$actual) * 100

print(paste("WMAPE:", wmape, "%"))

output_dir <- "C:/Users/ecemozturk/Desktop"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

file_path <- file.path(output_dir, "predictions_hour_5_with_actuals.xlsx")

write_xlsx(output_data, file_path)

print(output_data)
[1] "WMAPE: 154.103457098309 %"
Key: <date>
           date actual predicted     error
         <IDat>  <num>     <num>     <num>
  1: 2024-02-01   0.00 0.3766761 0.3766761
  2: 2024-02-02   0.00 0.3754115 0.3754115
  3: 2024-02-03   0.00 0.3770272 0.3770272
  4: 2024-02-04   0.00 0.3780127 0.3780127
  5: 2024-02-05   0.00 0.3854003 0.3854003
 ---                                      
101: 2024-05-11   0.64 0.4225375 0.2174625
102: 2024-05-12   0.00 0.4543419 0.4543419
103: 2024-05-13   0.76 0.4710763 0.2889237
104: 2024-05-14   0.96 0.4800365 0.4799635
105: 2024-05-15   0.84 0.4798892 0.3601108

As the number of data we have at hand increases, our regression models started to perform better,according to wmape levels.

In [165]:
# Filter data for hour 6
hour_6_data <- all_data[all_data$hour == 6, ]
hour_6_data <- hour_6_data[, -c(2)]
# Convert the data frame to a data.table
setDT(hour_6_data)
# Create a lagged variable for production with a lag of 1 period
hour_6_data[, lag_1_production := shift(production,1)]
hour_6_data[,lag_1_diff:=production-lag_1_production]
hour_6_data <- hour_6_data[!is.na(lag_1_production)]
hour_6_data$trend_hour_6 <- 1:nrow(hour_6_data)
# Fit linear regression model for hour 6
lm_hour_6 <- lm(production~+lag_1_production+uswrf_top_of_atmosphere+wday+ hourly_cloud_average+is.ramadan+special_period+is.religousday+month*hourly_max_t, data = hour_6_data) 
#lm_hour_6 <- lm(production ~ +uswrf_top_of_atmosphere + is.ramadan +is.weekend+ is.religousday +is.publicholiday + month*hourly_max_t , data = hour_6_data)
# Summarize the model
summary(lm_hour_6)
checkresiduals(lm_hour_6)
plot(lm_hour_6)
Call:
lm(formula = production ~ +lag_1_production + uswrf_top_of_atmosphere + 
    wday + hourly_cloud_average + is.ramadan + special_period + 
    is.religousday + month * hourly_max_t, data = hour_6_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.5222 -0.2081 -0.0312  0.1482  4.4413 

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)             -1.368e+00  1.318e+01  -0.104   0.9173    
lag_1_production         4.266e-01  3.218e-02  13.256   <2e-16 ***
uswrf_top_of_atmosphere  5.519e-02  3.712e-02   1.487   0.1375    
wdayCum                 -1.306e-01  7.841e-02  -1.666   0.0962 .  
wdayÇar                  4.254e-02  7.888e-02   0.539   0.5898    
wdayPaz                 -1.408e-01  7.828e-02  -1.799   0.0724 .  
wdayPer                 -7.452e-02  7.865e-02  -0.947   0.3437    
wdayPts                 -3.051e-02  7.888e-02  -0.387   0.6990    
wdaySal                 -1.332e-01  7.870e-02  -1.693   0.0909 .  
hourly_cloud_average    -2.392e-03  9.442e-04  -2.533   0.0115 *  
is.ramadan              -1.829e-01  9.955e-02  -1.837   0.0666 .  
special_period          -7.515e-01  7.904e-02  -9.508   <2e-16 ***
is.religousday           1.018e-01  1.330e-01   0.766   0.4440    
monthAra                -3.779e+00  1.518e+01  -0.249   0.8035    
monthEki                 9.220e+00  1.524e+01   0.605   0.5453    
monthEyl                 4.213e-01  1.545e+01   0.027   0.9783    
monthHaz                 9.651e+00  2.127e+01   0.454   0.6502    
monthKas                 3.229e+00  1.518e+01   0.213   0.8316    
monthMar                -1.643e+00  1.354e+01  -0.121   0.9035    
monthMay                 2.190e+01  1.520e+01   1.441   0.1500    
monthNis                -1.766e+01  1.435e+01  -1.231   0.2187    
monthOca                -3.706e-02  1.362e+01  -0.003   0.9978    
monthŞub                -1.313e-01  1.358e+01  -0.010   0.9923    
monthTem                 1.194e+00  1.731e+01   0.069   0.9450    
hourly_max_t             8.200e-03  4.598e-02   0.178   0.8585    
monthAra:hourly_max_t    1.123e-02  5.364e-02   0.209   0.8343    
monthEki:hourly_max_t   -3.356e-02  5.350e-02  -0.627   0.5306    
monthEyl:hourly_max_t   -1.327e-03  5.413e-02  -0.025   0.9804    
monthHaz:hourly_max_t   -3.341e-02  7.440e-02  -0.449   0.6534    
monthKas:hourly_max_t   -1.354e-02  5.348e-02  -0.253   0.8002    
monthMar:hourly_max_t    4.398e-03  4.739e-02   0.093   0.9261    
monthMay:hourly_max_t   -7.962e-02  5.342e-02  -1.491   0.1365    
monthNis:hourly_max_t    6.322e-02  5.027e-02   1.258   0.2089    
monthOca:hourly_max_t   -2.352e-03  4.774e-02  -0.049   0.9607    
monthŞub:hourly_max_t   -1.929e-03  4.758e-02  -0.041   0.9677    
monthTem:hourly_max_t   -2.952e-03  6.055e-02  -0.049   0.9611    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.6133 on 829 degrees of freedom
Multiple R-squared:  0.6422,	Adjusted R-squared:  0.6271 
F-statistic: 42.52 on 35 and 829 DF,  p-value: < 2.2e-16
	Breusch-Godfrey test for serial correlation of order up to 39

data:  Residuals
LM test = 196.53, df = 39, p-value < 2.2e-16
In [179]:
hour_6_data <- all_data[all_data$hour == 6, ]

# Plot production for hour 6
ggplot(hour_6_data, aes(x = date, y = production)) +
  geom_line() +
  labs(title = "Hourly Production Data for Hour 6",
       x = "Date",
       y = "Production") +
  theme_minimal()
In [166]:
hour_6_data <- all_data %>%
  filter(hour == 6, date >= start_date & date <= end_date)

forecast_data_hour_6 <- to_be_forecasted %>%
  filter(hour == 6)
forecast_data_hour_6$trend_hour_6 <- nrow(hour_6_data) + 1:nrow(forecast_data_hour_6)

last_production_value <- tail(hour_6_data$production, 1)
forecast_data_hour_6 <- forecast_data_hour_6 %>%
  mutate(lag_1_production = last_production_value)

predictions_hour_6 <- predict(lm_hour_6, newdata = forecast_data_hour_6)
predictions_hour_6[predictions_hour_6 < 0] <- 0

predictions_hour_6_df <- data.frame(
  date = forecast_data_hour_6$date,  
  predicted = predictions_hour_6
)

merged_data <- merge(hour_6_data, predictions_hour_6_df, by = "date")

output_data <- merged_data %>%
  select(date, actual = production, predicted)

output_data <- output_data %>%
  mutate(error = abs(predicted - actual))

wmape <- sum(output_data$error) / sum(output_data$actual) * 100

print(paste("WMAPE:", wmape, "%"))

output_dir <- "C:/Users/ecemozturk/Desktop"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

file_path <- file.path(output_dir, "predictions_hour_6_with_actuals.xlsx")

write_xlsx(output_data, file_path)

print(output_data)
[1] "WMAPE: 66.2971330635574 %"
Key: <date>
           date actual predicted     error
         <IDat>  <num>     <num>     <num>
  1: 2024-02-01   0.01  1.302795 1.2927954
  2: 2024-02-02   0.01  1.290989 1.2809892
  3: 2024-02-03   0.01  1.375376 1.3653757
  4: 2024-02-04   0.07  1.148062 1.0780617
  5: 2024-02-05   0.00  1.209222 1.2092218
 ---                                      
101: 2024-05-11   2.51  1.661629 0.8483713
102: 2024-05-12   0.00  1.599317 1.5993171
103: 2024-05-13   2.69  1.814346 0.8756536
104: 2024-05-14   2.59  1.908270 0.6817298
105: 2024-05-15   2.94  2.044396 0.8956045

As we analyze hour 7, we observed significant improvement in the performance of our forecast with WMAPE level of 29.7652596829167 %. Since we have necessary amount of data, our models also improved as expected. In addition, oır residuals started to fit betteer to QQ plot of standardized residuals vs theoretical quantiles plot. Similar calculations and processes were also followed for other hours.

In [34]:
library(data.table)

# Filter data for hour 7 and remove the hour column
hour_7_data <- all_data[all_data$hour == 7, ]
hour_7_data <- hour_7_data[, -c(2)]

# Create a trend variable for hour 7
hour_7_data$trend_hour_7 <- 1:nrow(hour_7_data)

# Convert the data frame to a data.table
setDT(hour_7_data)

# Create a lagged variable for production with a lag of 1 period
hour_7_data[, lag_1_production := shift(production,1)]
hour_7_data[,lag_1_diff:=production-lag_1_production]
# Remove rows with NA in lagged production to ensure the model can run
hour_7_data <- hour_7_data[!is.na(lag_1_production)]

# Fit linear regression model for hour 7 including the lagged variable
lm_hour_7 <- lm(production ~+lag_1_production+trend_hour_7 + special_period + dlwrf_surface + hourly_cloud_average + month+hourly_max_t,data=hour_7_data)
summary(lm_hour_7)

# Check residuals
library(forecast)
checkresiduals(lm_hour_7)

# Plot the model
plot(lm_hour_7)
Call:
lm(formula = production ~ +lag_1_production + trend_hour_7 + 
    special_period + dlwrf_surface + hourly_cloud_average + month + 
    hourly_max_t, data = hour_7_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.4844 -0.6254  0.0878  0.7845  9.0720 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)          -2.009e+01  5.698e+00  -3.526 0.000444 ***
lag_1_production      2.783e-01  3.120e-02   8.918  < 2e-16 ***
trend_hour_7          6.333e-04  2.351e-04   2.693 0.007215 ** 
special_period       -5.767e-01  1.690e-01  -3.413 0.000673 ***
dlwrf_surface        -1.845e-02  4.190e-03  -4.402 1.21e-05 ***
hourly_cloud_average -8.618e-03  3.471e-03  -2.483 0.013218 *  
monthAra             -1.474e+00  3.846e-01  -3.834 0.000135 ***
monthEki              7.961e-01  3.071e-01   2.592 0.009700 ** 
monthEyl              7.637e-01  2.710e-01   2.819 0.004937 ** 
monthHaz              1.155e+00  2.637e-01   4.378 1.35e-05 ***
monthKas             -5.118e-01  3.473e-01  -1.474 0.140955    
monthMar              1.082e-01  3.813e-01   0.284 0.776751    
monthMay              5.781e-01  2.890e-01   2.001 0.045751 *  
monthNis              6.890e-01  3.066e-01   2.247 0.024893 *  
monthOca             -1.459e+00  4.083e-01  -3.575 0.000370 ***
monthŞub             -6.433e-01  4.173e-01  -1.542 0.123544    
monthTem              1.014e+00  2.601e-01   3.901 0.000104 ***
hourly_max_t          9.756e-02  2.323e-02   4.200 2.95e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.381 on 847 degrees of freedom
Multiple R-squared:  0.6439,	Adjusted R-squared:  0.6367 
F-statistic: 90.08 on 17 and 847 DF,  p-value: < 2.2e-16
	Breusch-Godfrey test for serial correlation of order up to 21

data:  Residuals
LM test = 65.049, df = 21, p-value = 2.136e-06
In [180]:
# Plot production for hour 7
ggplot(hour_7_data, aes(x = date, y = production)) +
  geom_line() +
  labs(title = "Hourly Production Data for Hour 7",
       x = "Date",
       y = "Production") +
  theme_minimal()
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
In [156]:
hour_7_data <- all_data %>%
  filter(hour == 7, date >= start_date & date <= end_date)

forecast_data_hour_7 <- to_be_forecasted %>%
  filter(hour == 7)
forecast_data_hour_7$trend_hour_7 <- nrow(hour_7_data) + 1:nrow(forecast_data_hour_7)

last_production_value <- tail(hour_7_data$production, 1)
forecast_data_hour_7 <- forecast_data_hour_7 %>%
  mutate(lag_1_production = last_production_value)

predictions_hour_7 <- predict(lm_hour_7, newdata = forecast_data_hour_7)
predictions_hour_7[predictions_hour_7 < 0] <- 0

predictions_hour_7_df <- data.frame(
  date = forecast_data_hour_7$date,  
  predicted = predictions_hour_7
)

merged_data <- merge(hour_7_data, predictions_hour_7_df, by = "date")

output_data <- merged_data %>%
  select(date, actual = production, predicted)

output_data <- output_data %>%
  mutate(error = abs(predicted - actual))

wmape <- sum(output_data$error) / sum(output_data$actual) * 100

print(paste("WMAPE:", wmape, "%"))

output_dir <- "C:/Users/ecemozturk/Desktop"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

file_path <- file.path(output_dir, "predictions_hour_7_with_actuals.xlsx")

write_xlsx(output_data, file_path)

print(output_data)
[1] "WMAPE: 29.7652596829167 %"
Key: <date>
           date actual predicted     error
         <IDat>  <num>     <num>     <num>
  1: 2024-02-01   1.59  3.143221 1.5532210
  2: 2024-02-02   1.62  3.556651 1.9366508
  3: 2024-02-03   0.94  3.054084 2.1140840
  4: 2024-02-04   1.03  2.487699 1.4576990
  5: 2024-02-05   0.88  1.745296 0.8652960
 ---                                      
101: 2024-05-11   6.27  4.955730 1.3142697
102: 2024-05-12   0.00  3.903619 3.9036191
103: 2024-05-13   5.75  3.460671 2.2893287
104: 2024-05-14   4.14  3.880553 0.2594466
105: 2024-05-15   6.58  4.327481 2.2525187
In [39]:
# Filter data for hour 8
hour_8_data <- all_data[all_data$hour == 8, ]
hour_8_data <- hour_8_data[,-c(2)]
hour_8_data$trend_hour_8 <- 1:nrow(hour_8_data)
# Create a lagged variable for production with a lag of 1 period
hour_8_data[, lag_1_production := shift(production,1)]
hour_8_data[,lag_1_diff:=production-lag_1_production]
# Remove rows with NA in lagged production to ensure the model can run
hour_8_data <- hour_8_data[!is.na(lag_1_production)]
#Fit linear regression model for hour 8
lm_hour_8 <- lm(production ~ +lag_1_production+trend_hour_8+ csnow_surface+dlwrf_surface+hourly_cloud_average+is.ramadan+month*hourly_max_t, data = hour_8_data)
# Summarize the model
summary(lm_hour_8)

checkresiduals(lm_hour_8)
plot(lm_hour_8)
Call:
lm(formula = production ~ +lag_1_production + trend_hour_8 + 
    csnow_surface + dlwrf_surface + hourly_cloud_average + is.ramadan + 
    month * hourly_max_t, data = hour_8_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.7347 -0.9508  0.2114  1.0988  5.7171 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)            3.007e+00  3.530e+01   0.085  0.93214    
lag_1_production       1.645e-01  2.916e-02   5.640 2.33e-08 ***
trend_hour_8           1.089e-03  2.785e-04   3.910 9.96e-05 ***
csnow_surface         -1.217e+00  4.449e-01  -2.736  0.00636 ** 
dlwrf_surface         -2.800e-02  5.213e-03  -5.373 1.01e-07 ***
hourly_cloud_average  -2.446e-02  4.726e-03  -5.174 2.86e-07 ***
is.ramadan            -6.163e-01  3.035e-01  -2.031  0.04259 *  
monthAra              -6.920e+01  4.174e+01  -1.658  0.09774 .  
monthEki              -2.122e+01  4.112e+01  -0.516  0.60601    
monthEyl               2.384e+01  4.017e+01   0.593  0.55305    
monthHaz              -1.170e+01  4.634e+01  -0.252  0.80072    
monthKas              -2.417e+01  4.048e+01  -0.597  0.55073    
monthMar              -3.480e+01  3.654e+01  -0.952  0.34121    
monthMay              -5.104e+00  3.892e+01  -0.131  0.89569    
monthNis              -1.825e+01  3.752e+01  -0.486  0.62684    
monthOca              -4.946e+01  3.652e+01  -1.354  0.17605    
monthŞub              -3.804e+01  3.634e+01  -1.047  0.29550    
monthTem              -1.275e+01  4.553e+01  -0.280  0.77950    
hourly_max_t           3.750e-02  1.191e-01   0.315  0.75291    
monthAra:hourly_max_t  2.434e-01  1.438e-01   1.692  0.09107 .  
monthEki:hourly_max_t  7.415e-02  1.398e-01   0.531  0.59584    
monthEyl:hourly_max_t -8.233e-02  1.353e-01  -0.608  0.54312    
monthHaz:hourly_max_t  4.065e-02  1.564e-01   0.260  0.79500    
monthKas:hourly_max_t  8.094e-02  1.383e-01   0.585  0.55858    
monthMar:hourly_max_t  1.242e-01  1.233e-01   1.007  0.31409    
monthMay:hourly_max_t  1.699e-02  1.313e-01   0.129  0.89707    
monthNis:hourly_max_t  6.320e-02  1.264e-01   0.500  0.61721    
monthOca:hourly_max_t  1.710e-01  1.235e-01   1.385  0.16638    
monthŞub:hourly_max_t  1.333e-01  1.227e-01   1.086  0.27767    
monthTem:hourly_max_t  4.460e-02  1.531e-01   0.291  0.77096    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.818 on 835 degrees of freedom
Multiple R-squared:  0.5892,	Adjusted R-squared:  0.5749 
F-statistic: 41.29 on 29 and 835 DF,  p-value: < 2.2e-16
	Breusch-Godfrey test for serial correlation of order up to 33

data:  Residuals
LM test = 68.172, df = 33, p-value = 0.0003044
In [182]:
# Plot production for hour 8
ggplot(hour_8_data, aes(x = date, y = production)) +
  geom_line() +
  labs(title = "Hourly Production Data for Hour 8",
       x = "Date",
       y = "Production") +
  theme_minimal()
In [155]:
hour_8_data <- all_data %>%
  filter(hour == 8, date >= start_date & date <= end_date)

forecast_data_hour_8 <- to_be_forecasted %>%
  filter(hour == 8)
forecast_data_hour_8$trend_hour_8 <- nrow(hour_8_data) + 1:nrow(forecast_data_hour_8)

last_production_value <- tail(hour_8_data$production, 1)
forecast_data_hour_8 <- forecast_data_hour_8 %>%
  mutate(lag_1_production = last_production_value)

predictions_hour_8 <- predict(lm_hour_8, newdata = forecast_data_hour_8)
predictions_hour_8[predictions_hour_8 < 0] <- 0

predictions_hour_8_df <- data.frame(
  date = forecast_data_hour_8$date,  
  predicted = predictions_hour_8
)

merged_data <- merge(hour_8_data, predictions_hour_8_df, by = "date")

output_data <- merged_data %>%
  select(date, actual = production, predicted)

output_data <- output_data %>%
  mutate(error = abs(predicted - actual))

wmape <- sum(output_data$error) / sum(output_data$actual) * 100

print(paste("WMAPE:", wmape, "%"))

output_dir <- "C:/Users/ecemozturk/Desktop"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

file_path <- file.path(output_dir, "predictions_hour_8_with_actuals.xlsx")

write_xlsx(output_data, file_path)

print(output_data)
[1] "WMAPE: 22.6917817588993 %"
Key: <date>
           date actual predicted     error
         <IDat>  <num>     <num>     <num>
  1: 2024-02-01   6.17  5.694100 0.4759005
  2: 2024-02-02   6.15  6.627810 0.4778098
  3: 2024-02-03   4.21  5.471073 1.2610726
  4: 2024-02-04   5.04  4.426163 0.6138374
  5: 2024-02-05   3.01  2.770403 0.2395970
 ---                                      
101: 2024-05-11   9.07  7.128347 1.9416530
102: 2024-05-12   0.00  5.323364 5.3233640
103: 2024-05-13   9.24  4.310057 4.9299429
104: 2024-05-14   7.60  5.473529 2.1264705
105: 2024-05-15   9.13  5.993096 3.1369043

For hour 8, although there are still higher instances of lags in the residual ACF plot, no additional lag parameter was added to prevent multicollinearity.Similar assumption was also followed with other hours.

In [44]:
# Filter data for hour 9
hour_9_data <- all_data[all_data$hour == 9, ]
hour_9_data <- hour_9_data[,-c(2)]
hour_9_data$trend_hour_9 <- 1:nrow(hour_9_data)
hour_9_data[, lag_1_production := shift(production,1)]
hour_9_data[,lag_1_diff:=production-lag_1_production]
# Remove rows with NA in lagged production to ensure the model can run
hour_9_data <- hour_9_data[!is.na(lag_1_production)]
# Fit linear regression model for hour 9
lm_hour_9 <- lm(production ~+lag_1_production+special_period+dswrf_surface+csnow_surface+dlwrf_surface+is.nationalday+uswrf_surface+hourly_cloud_average+month*hourly_max_t, data = hour_9_data)
# Summarize the model
summary(lm_hour_9)
checkresiduals(lm_hour_9)
plot(lm_hour_9)
Call:
lm(formula = production ~ +lag_1_production + special_period + 
    dswrf_surface + csnow_surface + dlwrf_surface + is.nationalday + 
    uswrf_surface + hourly_cloud_average + month * hourly_max_t, 
    data = hour_9_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.3796 -1.0059  0.2701  1.2028  5.2072 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)           -2.673e+01  3.908e+01  -0.684  0.49423    
lag_1_production       8.335e-02  2.831e-02   2.945  0.00332 ** 
special_period        -3.158e-01  2.134e-01  -1.480  0.13923    
dswrf_surface         -3.688e-02  1.474e-02  -2.502  0.01255 *  
csnow_surface         -1.383e+00  5.236e-01  -2.641  0.00841 ** 
dlwrf_surface         -3.795e-02  6.159e-03  -6.161 1.12e-09 ***
is.nationalday         1.015e+00  5.652e-01   1.796  0.07284 .  
uswrf_surface          1.028e-01  4.429e-02   2.320  0.02058 *  
hourly_cloud_average  -2.812e-02  5.303e-03  -5.304 1.45e-07 ***
monthAra              -1.102e+02  4.719e+01  -2.334  0.01982 *  
monthEki              -3.741e+01  4.523e+01  -0.827  0.40833    
monthEyl               2.276e+01  4.286e+01   0.531  0.59555    
monthHaz              -2.816e+01  4.784e+01  -0.589  0.55618    
monthKas              -1.590e+01  4.274e+01  -0.372  0.70990    
monthMar              -3.979e+01  4.214e+01  -0.944  0.34530    
monthMay              -1.751e+01  4.198e+01  -0.417  0.67663    
monthNis              -1.645e+01  4.081e+01  -0.403  0.68701    
monthOca              -7.982e+01  4.042e+01  -1.975  0.04860 *  
monthŞub              -2.391e+01  4.025e+01  -0.594  0.55266    
monthTem               2.289e+01  4.695e+01   0.488  0.62600    
hourly_max_t           1.581e-01  1.299e-01   1.217  0.22396    
monthAra:hourly_max_t  3.981e-01  1.604e-01   2.481  0.01329 *  
monthEki:hourly_max_t  1.316e-01  1.506e-01   0.874  0.38220    
monthEyl:hourly_max_t -7.465e-02  1.411e-01  -0.529  0.59699    
monthHaz:hourly_max_t  1.012e-01  1.584e-01   0.639  0.52314    
monthKas:hourly_max_t  5.577e-02  1.424e-01   0.392  0.69550    
monthMar:hourly_max_t  1.448e-01  1.402e-01   1.032  0.30227    
monthMay:hourly_max_t  6.412e-02  1.388e-01   0.462  0.64420    
monthNis:hourly_max_t  5.935e-02  1.347e-01   0.441  0.65955    
monthOca:hourly_max_t  2.893e-01  1.342e-01   2.156  0.03139 *  
monthŞub:hourly_max_t  8.869e-02  1.334e-01   0.665  0.50621    
monthTem:hourly_max_t -7.121e-02  1.546e-01  -0.461  0.64511    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.981 on 833 degrees of freedom
Multiple R-squared:  0.5639,	Adjusted R-squared:  0.5476 
F-statistic: 34.74 on 31 and 833 DF,  p-value: < 2.2e-16
	Breusch-Godfrey test for serial correlation of order up to 35

data:  Residuals
LM test = 20.373, df = 35, p-value = 0.9769
In [183]:
# Plot production for hour 9
ggplot(hour_9_data, aes(x = date, y = production)) +
  geom_line() +
  labs(title = "Hourly Production Data for Hour 9",
       x = "Date",
       y = "Production") +
  theme_minimal()
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
In [154]:
hour_9_data <- all_data %>%
  filter(hour == 9, date >= start_date & date <= end_date)

forecast_data_hour_9 <- to_be_forecasted %>%
  filter(hour == 9)
forecast_data_hour_9$trend_hour_9 <- nrow(hour_9_data) + 1:nrow(forecast_data_hour_9)

last_production_value <- tail(hour_9_data$production, 1)
forecast_data_hour_9 <- forecast_data_hour_9 %>%
  mutate(lag_1_production = last_production_value)

predictions_hour_9 <- predict(lm_hour_9, newdata = forecast_data_hour_9)
predictions_hour_9[predictions_hour_9 < 0] <- 0

predictions_hour_9_df <- data.frame(
  date = forecast_data_hour_9$date,  
  predicted = predictions_hour_9
)

merged_data <- merge(hour_9_data, predictions_hour_9_df, by = "date")

output_data <- merged_data %>%
  select(date, actual = production, predicted)

output_data <- output_data %>%
  mutate(error = abs(predicted - actual))

wmape <- sum(output_data$error) / sum(output_data$actual) * 100

print(paste("WMAPE:", wmape, "%"))

output_dir <- "C:/Users/ecemozturk/Desktop"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

file_path <- file.path(output_dir, "predictions_hour_9_with_actuals.xlsx")

write_xlsx(output_data, file_path)

print(output_data)
[1] "WMAPE: 19.5185218600221 %"
Key: <date>
           date actual predicted     error
         <IDat>  <num>     <num>     <num>
  1: 2024-02-01   7.11  8.060603 0.9506034
  2: 2024-02-02   9.03  9.322309 0.2923085
  3: 2024-02-03   6.83  7.574831 0.7448312
  4: 2024-02-04   9.02  6.479980 2.5400196
  5: 2024-02-05   4.21  4.660045 0.4500446
 ---                                      
101: 2024-05-11   9.85  8.116059 1.7339413
102: 2024-05-12   0.00  6.053854 6.0538535
103: 2024-05-13   8.88  4.573623 4.3063768
104: 2024-05-14   8.83  5.654152 3.1758482
105: 2024-05-15   9.82  6.814517 3.0054834
In [117]:
# Create a trend variable for hour 10
hour_10_data <- all_data[all_data$hour == 10, ]
hour_10_data <- hour_10_data[,-c(2)]
hour_10_data$trend_hour_10 <- 1:nrow(hour_10_data)
hour_10_data[, lag_1_production := shift(production,1)]
hour_10_data[,lag_1_diff:=production-lag_1_production]
# Remove rows with NA in lagged production to ensure the model can run
hour_10_data <- hour_10_data[!is.na(lag_1_production)]
# Fit linear regression model for hour 10
lm_hour_10 <- lm(production ~+lag_1_production +csnow_surface+dlwrf_surface+hourly_cloud_average+is.weekend+is.religousday+is.nationalday+month*hourly_max_t, data = hour_10_data)
summary(lm_hour_10)

checkresiduals(lm_hour_10)
plot(lm_hour_10)

pacf(hour_10_data$production)
Call:
lm(formula = production ~ +lag_1_production + csnow_surface + 
    dlwrf_surface + hourly_cloud_average + is.weekend + is.religousday + 
    is.nationalday + month * hourly_max_t, data = hour_10_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.8962 -0.6891  0.2512  1.0714  5.4454 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)           -2.762e+01  3.911e+01  -0.706  0.48024    
lag_1_production       7.410e-02  2.789e-02   2.657  0.00804 ** 
csnow_surface         -2.278e+00  5.748e-01  -3.964 8.01e-05 ***
dlwrf_surface         -3.159e-02  4.756e-03  -6.643 5.55e-11 ***
hourly_cloud_average  -2.114e-02  5.198e-03  -4.067 5.21e-05 ***
is.weekend            -1.860e-01  1.552e-01  -1.198  0.23113    
is.religousday         8.104e-01  4.378e-01   1.851  0.06450 .  
is.nationalday         7.260e-01  5.847e-01   1.242  0.21470    
monthAra              -1.330e+02  4.784e+01  -2.780  0.00556 ** 
monthEki              -2.118e+01  4.538e+01  -0.467  0.64071    
monthEyl               3.466e+00  4.299e+01   0.081  0.93575    
monthHaz              -4.052e+01  4.656e+01  -0.870  0.38434    
monthKas              -1.036e+01  4.223e+01  -0.245  0.80617    
monthMar               1.790e+01  4.028e+01   0.445  0.65679    
monthMay              -3.094e+00  4.168e+01  -0.074  0.94084    
monthNis               3.873e+00  4.079e+01   0.095  0.92437    
monthOca              -7.934e+01  4.127e+01  -1.922  0.05490 .  
monthŞub               1.043e+01  4.058e+01   0.257  0.79716    
monthTem               1.904e+01  4.619e+01   0.412  0.68038    
hourly_max_t           1.517e-01  1.266e-01   1.199  0.23102    
monthAra:hourly_max_t  4.782e-01  1.600e-01   2.989  0.00288 ** 
monthEki:hourly_max_t  7.438e-02  1.483e-01   0.501  0.61616    
monthEyl:hourly_max_t -1.009e-02  1.390e-01  -0.073  0.94216    
monthHaz:hourly_max_t  1.358e-01  1.514e-01   0.897  0.36997    
monthKas:hourly_max_t  3.748e-02  1.378e-01   0.272  0.78568    
monthMar:hourly_max_t -5.926e-02  1.306e-01  -0.454  0.65010    
monthMay:hourly_max_t  1.042e-02  1.351e-01   0.077  0.93855    
monthNis:hourly_max_t -1.247e-02  1.320e-01  -0.094  0.92475    
monthOca:hourly_max_t  2.886e-01  1.349e-01   2.140  0.03267 *  
monthŞub:hourly_max_t -3.244e-02  1.320e-01  -0.246  0.80598    
monthTem:hourly_max_t -6.103e-02  1.496e-01  -0.408  0.68333    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.049 on 834 degrees of freedom
Multiple R-squared:  0.5422,	Adjusted R-squared:  0.5257 
F-statistic: 32.92 on 30 and 834 DF,  p-value: < 2.2e-16
	Breusch-Godfrey test for serial correlation of order up to 34

data:  Residuals
LM test = 72.303, df = 34, p-value = 0.0001407
In [184]:
# Plot production for hour 10
ggplot(hour_10_data, aes(x = date, y = production)) +
  geom_line() +
  labs(title = "Hourly Production Data for Hour 10",
       x = "Date",
       y = "Production") +
  theme_minimal()
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
In [153]:
hour_10_data <- all_data %>%
  filter(hour == 10, date >= start_date & date <= end_date)

forecast_data_hour_10 <- to_be_forecasted %>%
  filter(hour == 10)
forecast_data_hour_10$trend_hour_10 <- nrow(hour_10_data) + 1:nrow(forecast_data_hour_10)

last_production_value <- tail(hour_10_data$production, 1)
forecast_data_hour_10 <- forecast_data_hour_10 %>%
  mutate(lag_1_production = last_production_value)

predictions_hour_10 <- predict(lm_hour_10, newdata = forecast_data_hour_10)
predictions_hour_10[predictions_hour_10 < 0] <- 0

predictions_hour_10_df <- data.frame(
  date = forecast_data_hour_10$date,  
  predicted = predictions_hour_10
)

merged_data <- merge(hour_10_data, predictions_hour_10_df, by = "date")

output_data <- merged_data %>%
  select(date, actual = production, predicted)

output_data <- output_data %>%
  mutate(error = abs(predicted - actual))

wmape <- sum(output_data$error) / sum(output_data$actual) * 100

print(paste("WMAPE:", wmape, "%"))

output_dir <- "C:/Users/ecemozturk/Desktop"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

file_path <- file.path(output_dir, "predictions_hour_10_with_actuals.xlsx")

write_xlsx(output_data, file_path)

print(output_data)
[1] "WMAPE: 17.1336671941216 %"
Key: <date>
           date actual predicted      error
         <IDat>  <num>     <num>      <num>
  1: 2024-02-01   5.20  9.103181 3.90318118
  2: 2024-02-02   9.85  9.791868 0.05813237
  3: 2024-02-03   6.11  7.154666 1.04466562
  4: 2024-02-04   9.83  7.315851 2.51414880
  5: 2024-02-05   6.69  6.172011 0.51798898
 ---                                       
101: 2024-05-11   9.54  8.404196 1.13580386
102: 2024-05-12   0.00  6.531169 6.53116888
103: 2024-05-13   7.85  4.705113 3.14488672
104: 2024-05-14   7.44  7.697921 0.25792078
105: 2024-05-15   9.87  8.350045 1.51995509
In [121]:
# Create a trend variable for hour 11
hour_11_data <- all_data[all_data$hour == 11, ]
hour_11_data <- hour_11_data[,-c(2)]
hour_11_data$trend_hour_11 <- 1:nrow(hour_11_data)

hour_11_data[, lag_19_production := shift(production,19)]
hour_11_data[,lag_19_diff:=production-lag_19_production]
# Remove rows with NA in lagged production to ensure the model can run
hour_11_data <- hour_11_data[!is.na(lag_19_production)]
# Fit linear regression model for hour 11
lm_hour_11 <- lm(production ~+lag_19_production+trend_hour_11+special_period+csnow_surface+dlwrf_surface+tmp_surface+hourly_cloud_average+is.weekend+is.religousday+is.nationalday+month*hourly_max_t, data = hour_11_data)
summary(lm_hour_11)
checkresiduals(lm_hour_11)
plot(lm_hour_11)
Call:
lm(formula = production ~ +lag_19_production + trend_hour_11 + 
    special_period + csnow_surface + dlwrf_surface + tmp_surface + 
    hourly_cloud_average + is.weekend + is.religousday + is.nationalday + 
    month * hourly_max_t, data = hour_11_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.7618 -0.6632  0.2996  1.0666  6.7623 

Coefficients: (1 not defined because of singularities)
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)           -7.716e+01  3.824e+01  -2.018  0.04394 *  
lag_19_production     -8.089e-02  2.593e-02  -3.120  0.00187 ** 
trend_hour_11         -5.759e-04  3.821e-04  -1.507  0.13214    
special_period         3.920e-01  2.550e-01   1.537  0.12472    
csnow_surface         -1.899e+00  5.783e-01  -3.285  0.00106 ** 
dlwrf_surface         -3.286e-02  4.517e-03  -7.276 8.12e-13 ***
tmp_surface            3.146e-01  1.221e-01   2.577  0.01014 *  
hourly_cloud_average  -1.540e-02  5.328e-03  -2.891  0.00394 ** 
is.weekend            -1.661e-01  1.554e-01  -1.069  0.28525    
is.religousday         1.466e-01  4.444e-01   0.330  0.74160    
is.nationalday         2.450e-01  5.779e-01   0.424  0.67173    
monthAra              -7.152e+01  4.482e+01  -1.596  0.11094    
monthEki               3.889e+01  4.401e+01   0.884  0.37711    
monthEyl               3.329e+01  4.175e+01   0.797  0.42550    
monthHaz               3.922e+01  4.427e+01   0.886  0.37591    
monthKas               1.830e+01  4.073e+01   0.449  0.65327    
monthMar               6.715e+01  3.935e+01   1.707  0.08828 .  
monthMay               4.722e+01  4.049e+01   1.166  0.24381    
monthNis               4.275e+01  3.974e+01   1.076  0.28237    
monthOca              -3.682e+01  4.078e+01  -0.903  0.36678    
monthŞub               7.621e+01  3.968e+01   1.921  0.05513 .  
monthTem               4.128e+01  4.461e+01   0.925  0.35500    
hourly_max_t                  NA         NA      NA       NA    
monthAra:hourly_max_t  2.730e-01  1.471e-01   1.856  0.06384 .  
monthEki:hourly_max_t -1.202e-01  1.419e-01  -0.847  0.39706    
monthEyl:hourly_max_t -1.036e-01  1.332e-01  -0.778  0.43658    
monthHaz:hourly_max_t -1.240e-01  1.421e-01  -0.873  0.38299    
monthKas:hourly_max_t -4.877e-02  1.309e-01  -0.372  0.70962    
monthMar:hourly_max_t -2.166e-01  1.259e-01  -1.721  0.08572 .  
monthMay:hourly_max_t -1.520e-01  1.295e-01  -1.174  0.24087    
monthNis:hourly_max_t -1.345e-01  1.269e-01  -1.059  0.28970    
monthOca:hourly_max_t  1.516e-01  1.318e-01   1.150  0.25041    
monthŞub:hourly_max_t -2.490e-01  1.274e-01  -1.955  0.05098 .  
monthTem:hourly_max_t -1.296e-01  1.426e-01  -0.909  0.36342    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.03 on 814 degrees of freedom
Multiple R-squared:  0.5386,	Adjusted R-squared:  0.5204 
F-statistic: 29.69 on 32 and 814 DF,  p-value: < 2.2e-16
	Breusch-Godfrey test for serial correlation of order up to 37

data:  Residuals
LM test = 59.296, df = 37, p-value = 0.01143
In [185]:
# Plot production for hour 11
ggplot(hour_11_data, aes(x = date, y = production)) +
  geom_line() +
  labs(title = "Hourly Production Data for Hour 11",
       x = "Date",
       y = "Production") +
  theme_minimal()
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
In [152]:
hour_11_data <- all_data %>%
  filter(hour == 11, date >= start_date & date <= end_date)

forecast_data_hour_11 <- to_be_forecasted %>%
  filter(hour == 11)
forecast_data_hour_11$trend_hour_11 <- nrow(hour_11_data) + 1:nrow(forecast_data_hour_11)

last_production_value <- tail(hour_11_data$production, 1)
forecast_data_hour_11 <- forecast_data_hour_11 %>%
  mutate(lag_19_production = last_production_value)

predictions_hour_11 <- predict(lm_hour_11, newdata = forecast_data_hour_11)
predictions_hour_11[predictions_hour_11 < 0] <- 0

predictions_hour_11_df <- data.frame(
  date = forecast_data_hour_11$date,  
  predicted = predictions_hour_11
)

merged_data <- merge(hour_11_data, predictions_hour_11_df, by = "date")

output_data <- merged_data %>%
  select(date, actual = production, predicted)

output_data <- output_data %>%
  mutate(error = abs(predicted - actual))

wmape <- sum(output_data$error) / sum(output_data$actual) * 100

print(paste("WMAPE:", wmape, "%"))

output_dir <- "C:/Users/ecemozturk/Desktop"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

file_path <- file.path(output_dir, "predictions_hour_11_with_actuals.xlsx")

write_xlsx(output_data, file_path)

print(output_data)
[1] "WMAPE: 19.6298861504385 %"
Key: <date>
           date actual predicted     error
         <IDat>  <num>     <num>     <num>
  1: 2024-02-01   3.53  9.113857 5.5838567
  2: 2024-02-02   9.88  9.598790 0.2812103
  3: 2024-02-03   9.09  7.347482 1.7425182
  4: 2024-02-04   6.96  7.403292 0.4432921
  5: 2024-02-05   4.12  5.898651 1.7786511
 ---                                      
101: 2024-05-11   5.76  7.810759 2.0507586
102: 2024-05-12   0.00  5.750518 5.7505177
103: 2024-05-13   8.36  4.728926 3.6310744
104: 2024-05-14   5.84  7.389215 1.5492146
105: 2024-05-15   9.82  8.429268 1.3907323
In [123]:
# Create a trend variable for hour 12
hour_12_data <- all_data[all_data$hour == 12, ]
hour_12_data <- hour_12_data[,-c(2)]
hour_12_data[, lag_1_production := shift(production,1)]
hour_12_data[,lag_1_diff:=production-lag_1_production]
hour_12_data <- hour_12_data[!is.na(lag_1_production)]
hour_12_data$trend_hour_12 <- 1:nrow(hour_12_data)
lm_hour_12 <- lm(production ~ +lag_1_production+trend_hour_12+csnow_surface+dlwrf_surface+tmp_surface+hourly_cloud_average+is.weekend+is.religousday+is.nationalday+month*hourly_max_t , data = hour_12_data)
summary(lm_hour_12)
checkresiduals(lm_hour_12)
plot(lm_hour_12)
Call:
lm(formula = production ~ +lag_1_production + trend_hour_12 + 
    csnow_surface + dlwrf_surface + tmp_surface + hourly_cloud_average + 
    is.weekend + is.religousday + is.nationalday + month * hourly_max_t, 
    data = hour_12_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.5394 -0.7375  0.2502  1.1136  6.5160 

Coefficients: (1 not defined because of singularities)
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)           -4.327e+01  3.850e+01  -1.124  0.26137    
lag_1_production       9.071e-02  2.827e-02   3.209  0.00138 ** 
trend_hour_12         -7.090e-04  3.195e-04  -2.219  0.02677 *  
csnow_surface         -1.375e+00  5.384e-01  -2.554  0.01081 *  
dlwrf_surface         -3.082e-02  4.333e-03  -7.113 2.44e-12 ***
tmp_surface            2.000e-01  1.218e-01   1.641  0.10110    
hourly_cloud_average  -1.622e-02  5.297e-03  -3.062  0.00227 ** 
is.weekend            -3.110e-01  1.531e-01  -2.031  0.04257 *  
is.religousday         1.213e-01  4.404e-01   0.275  0.78305    
is.nationalday         3.109e-01  5.763e-01   0.540  0.58969    
monthAra              -4.290e+01  4.372e+01  -0.981  0.32671    
monthEki               5.402e+00  4.385e+01   0.123  0.90198    
monthEyl               1.075e+01  4.171e+01   0.258  0.79661    
monthHaz               1.580e+01  4.355e+01   0.363  0.71694    
monthKas              -2.455e+00  4.062e+01  -0.060  0.95183    
monthMar               3.075e+01  3.939e+01   0.781  0.43513    
monthMay               3.139e+01  4.056e+01   0.774  0.43920    
monthNis               1.588e+01  3.993e+01   0.398  0.69096    
monthOca              -4.011e+01  4.033e+01  -0.995  0.32026    
monthŞub               4.898e+01  3.964e+01   1.235  0.21701    
monthTem               3.862e+01  4.401e+01   0.878  0.38042    
hourly_max_t                  NA         NA      NA       NA    
monthAra:hourly_max_t  1.574e-01  1.416e-01   1.112  0.26663    
monthEki:hourly_max_t -1.524e-02  1.401e-01  -0.109  0.91336    
monthEyl:hourly_max_t -3.368e-02  1.319e-01  -0.255  0.79852    
monthHaz:hourly_max_t -5.101e-02  1.386e-01  -0.368  0.71283    
monthKas:hourly_max_t  1.198e-02  1.293e-01   0.093  0.92622    
monthMar:hourly_max_t -9.976e-02  1.248e-01  -0.799  0.42442    
monthMay:hourly_max_t -1.038e-01  1.286e-01  -0.807  0.41968    
monthNis:hourly_max_t -5.067e-02  1.265e-01  -0.401  0.68879    
monthOca:hourly_max_t  1.482e-01  1.288e-01   1.151  0.25025    
monthŞub:hourly_max_t -1.658e-01  1.261e-01  -1.316  0.18864    
monthTem:hourly_max_t -1.230e-01  1.394e-01  -0.882  0.37792    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.019 on 833 degrees of freedom
Multiple R-squared:  0.5384,	Adjusted R-squared:  0.5212 
F-statistic: 31.34 on 31 and 833 DF,  p-value: < 2.2e-16
	Breusch-Godfrey test for serial correlation of order up to 36

data:  Residuals
LM test = 32.413, df = 36, p-value = 0.6399
In [186]:
ggplot(hour_12_data, aes(x = date, y = production)) +
  geom_line() +
  labs(title = "Hourly Production Data for Hour 12",
       x = "Date",
       y = "Production") +
  theme_minimal()
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
In [149]:
hour_12_data <- all_data %>%
  filter(hour == 12, date >= start_date & date <= end_date)

forecast_data_hour_12 <- to_be_forecasted %>%
  filter(hour == 12)
forecast_data_hour_12$trend_hour_12 <- nrow(hour_12_data) + 1:nrow(forecast_data_hour_12)

last_production_value <- tail(hour_12_data$production, 1)
forecast_data_hour_12 <- forecast_data_hour_12 %>%
  mutate(lag_1_production = last_production_value)

predictions_hour_12 <- predict(lm_hour_12, newdata = forecast_data_hour_12)
predictions_hour_12[predictions_hour_12 < 0] <- 0

predictions_hour_12_df <- data.frame(
  date = forecast_data_hour_12$date,  
  predicted = predictions_hour_12
)

merged_data <- merge(hour_12_data, predictions_hour_12_df, by = "date")

output_data <- merged_data %>%
  select(date, actual = production, predicted)

output_data <- output_data %>%
  mutate(error = abs(predicted - actual))

wmape <- sum(output_data$error) / sum(output_data$actual) * 100

print(paste("WMAPE:", wmape, "%"))

output_dir <- "C:/Users/ecemozturk/Desktop"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

file_path <- file.path(output_dir, "predictions_hour_12_with_actuals.xlsx")

write_xlsx(output_data, file_path)

print(output_data)
[1] "WMAPE: 24.145872036254 %"
Key: <date>
           date actual predicted     error
         <IDat>  <num>     <num>     <num>
  1: 2024-02-01   4.92  9.065805 4.1458054
  2: 2024-02-02   9.85  9.499731 0.3502688
  3: 2024-02-03   8.71  7.153421 1.5565785
  4: 2024-02-04   9.19  7.395700 1.7942998
  5: 2024-02-05   2.85  5.925434 3.0754340
 ---                                      
101: 2024-05-11   2.25  8.158062 5.9080618
102: 2024-05-12   0.00  5.995392 5.9953918
103: 2024-05-13   5.34  5.967664 0.6276639
104: 2024-05-14   7.78  8.504438 0.7244383
105: 2024-05-15   7.92  8.874813 0.9548128
In [147]:
# Create a trend variable for hour 13
hour_13_data <- all_data[all_data$hour == 13, ]
hour_13_data <- hour_13_data[,-c(2)]

hour_13_data[, lag_1_production := shift(production,1)]
hour_13_data[,lag_1_diff:=production-lag_1_production]
# Remove rows with NA in lagged production to ensure the model can run
hour_13_data <- hour_13_data[!is.na(lag_1_production)]
# Remove rows with missing values in the hourly_max_t column
hour_13_data$trend_hour_13 <- 1:nrow(hour_13_data)
lm_hour_13 <- lm(production ~+lag_1_production +trend_hour_13+dlwrf_surface+tmp_surface+hourly_cloud_average+is.weekend+is.nationalday+is.publicholiday+month * hourly_max_t , data = hour_13_data)
summary(lm_hour_13)
checkresiduals(lm_hour_13)
plot(lm_hour_13)
Call:
lm(formula = production ~ +lag_1_production + trend_hour_13 + 
    dlwrf_surface + tmp_surface + hourly_cloud_average + is.weekend + 
    is.nationalday + is.publicholiday + month * hourly_max_t, 
    data = hour_13_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.7178 -0.9401  0.3124  1.2178  6.2601 

Coefficients: (1 not defined because of singularities)
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)           -8.568e+01  3.993e+01  -2.146 0.032188 *  
lag_1_production       9.508e-02  2.783e-02   3.416 0.000666 ***
trend_hour_13         -6.315e-04  3.237e-04  -1.951 0.051391 .  
dlwrf_surface         -3.221e-02  4.352e-03  -7.401 3.29e-13 ***
tmp_surface            3.333e-01  1.259e-01   2.647 0.008275 ** 
hourly_cloud_average  -2.541e-02  5.556e-03  -4.573 5.54e-06 ***
is.weekend             3.282e+00  1.184e+00   2.773 0.005670 ** 
is.nationalday         1.910e+00  8.175e-01   2.336 0.019730 *  
is.publicholiday      -3.571e+00  1.194e+00  -2.991 0.002866 ** 
monthAra               1.945e+01  4.467e+01   0.435 0.663442    
monthEki               7.421e+01  4.535e+01   1.636 0.102127    
monthEyl               6.383e+01  4.335e+01   1.472 0.141287    
monthHaz               6.102e+01  4.485e+01   1.360 0.174059    
monthKas               3.433e+01  4.208e+01   0.816 0.414872    
monthMar               6.470e+01  4.067e+01   1.591 0.112032    
monthMay               8.486e+01  4.199e+01   2.021 0.043580 *  
monthNis               4.363e+01  4.147e+01   1.052 0.293125    
monthOca               1.388e+01  4.160e+01   0.334 0.738782    
monthŞub               8.158e+01  4.100e+01   1.990 0.046951 *  
monthTem               7.787e+01  4.461e+01   1.746 0.081257 .  
hourly_max_t                  NA         NA      NA       NA    
monthAra:hourly_max_t -4.841e-02  1.438e-01  -0.337 0.736503    
monthEki:hourly_max_t -2.372e-01  1.444e-01  -1.642 0.100914    
monthEyl:hourly_max_t -2.004e-01  1.367e-01  -1.467 0.142840    
monthHaz:hourly_max_t -1.920e-01  1.423e-01  -1.350 0.177469    
monthKas:hourly_max_t -1.038e-01  1.335e-01  -0.777 0.437101    
monthMar:hourly_max_t -2.034e-01  1.284e-01  -1.584 0.113635    
monthMay:hourly_max_t -2.722e-01  1.328e-01  -2.050 0.040671 *  
monthNis:hourly_max_t -1.350e-01  1.310e-01  -1.031 0.302951    
monthOca:hourly_max_t -2.678e-02  1.323e-01  -0.202 0.839604    
monthŞub:hourly_max_t -2.628e-01  1.298e-01  -2.024 0.043250 *  
monthTem:hourly_max_t -2.451e-01  1.408e-01  -1.741 0.082065 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.095 on 834 degrees of freedom
Multiple R-squared:  0.5576,	Adjusted R-squared:  0.5417 
F-statistic: 35.05 on 30 and 834 DF,  p-value: < 2.2e-16
	Breusch-Godfrey test for serial correlation of order up to 35

data:  Residuals
LM test = 56.897, df = 35, p-value = 0.01108
In [187]:
ggplot(hour_13_data, aes(x = date, y = production)) +
  geom_line() +
  labs(title = "Hourly Production Data for Hour 13",
       x = "Date",
       y = "Production") +
  theme_minimal()
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
In [148]:
hour_13_data <- all_data %>%
  filter(hour == 13, date >= start_date & date <= end_date)

forecast_data_hour_13 <- to_be_forecasted %>%
  filter(hour == 13)
forecast_data_hour_13$trend_hour_13 <- nrow(hour_13_data) + 1:nrow(forecast_data_hour_13)

last_production_value <- tail(hour_13_data$production, 1)
forecast_data_hour_13 <- forecast_data_hour_13 %>%
  mutate(lag_1_production = last_production_value)

predictions_hour_13 <- predict(lm_hour_13, newdata = forecast_data_hour_13)
predictions_hour_13[predictions_hour_13 < 0] <- 0

predictions_hour_13_df <- data.frame(
  date = forecast_data_hour_13$date,  
  predicted = predictions_hour_13
)

merged_data <- merge(hour_13_data, predictions_hour_13_df, by = "date")

output_data <- merged_data %>%
  select(date, actual = production, predicted)

output_data <- output_data %>%
  mutate(error = abs(predicted - actual))

wmape <- sum(output_data$error) / sum(output_data$actual) * 100

print(paste("WMAPE:", wmape, "%"))

output_dir <- "C:/Users/ecemozturk/Desktop"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

file_path <- file.path(output_dir, "predictions_hour_13_with_actuals.xlsx")

write_xlsx(output_data, file_path)

print(output_data)
[1] "WMAPE: 23.9313334909365 %"
Key: <date>
           date actual predicted     error
         <IDat>  <num>     <num>     <num>
  1: 2024-02-01   4.90  8.903164 4.0031635
  2: 2024-02-02   9.85  9.521604 0.3283964
  3: 2024-02-03   8.02  6.644535 1.3754651
  4: 2024-02-04   9.00  7.239842 1.7601575
  5: 2024-02-05   3.01  5.773566 2.7635661
 ---                                      
101: 2024-05-11   3.67  8.253055 4.5830546
102: 2024-05-12   0.00  5.725042 5.7250422
103: 2024-05-13   4.04  5.886979 1.8469792
104: 2024-05-14   9.64  8.951888 0.6881117
105: 2024-05-15   7.18  8.784960 1.6049597
In [74]:
hour_14_data <- all_data[all_data$hour == 14, ]
hour_14_data <- hour_14_data[,-c(2)]
hour_14_data$trend_hour_14 <- 1:nrow(hour_14_data)
hour_14_data[, lag_14_production := shift(production,1)]
hour_14_data[,lag_14_diff:=production-lag_14_production]
# Remove rows with NA in lagged production to ensure the model can run
hour_14_data <- hour_14_data[!is.na(lag_14_production)]
lm_hour_14 <- lm(production ~ +lag_14_production+ special_period+dlwrf_surface+tmp_surface+is.weekend+is.ramadan+is.religousday+is.nationalday+is.publicholiday+hourly_cloud_average+month*hourly_max_t , data = hour_14_data)
summary(lm_hour_14)
checkresiduals(lm_hour_14)
plot(lm_hour_14)
Call:
lm(formula = production ~ +lag_14_production + special_period + 
    dlwrf_surface + tmp_surface + is.weekend + is.ramadan + is.religousday + 
    is.nationalday + is.publicholiday + hourly_cloud_average + 
    month * hourly_max_t, data = hour_14_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.4902 -1.0601  0.2299  1.2149  6.5299 

Coefficients: (1 not defined because of singularities)
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)           -49.229332  33.355821  -1.476  0.14035    
lag_14_production       0.107539   0.028577   3.763  0.00018 ***
special_period          0.407887   0.218537   1.866  0.06233 .  
dlwrf_surface          -0.025869   0.004248  -6.090 1.72e-09 ***
tmp_surface             0.208630   0.105379   1.980  0.04805 *  
is.weekend              3.323860   1.135289   2.928  0.00351 ** 
is.ramadan              0.207241   0.329407   0.629  0.52943    
is.religousday         -0.017660   0.432060  -0.041  0.96741    
is.nationalday          1.884202   0.783760   2.404  0.01643 *  
is.publicholiday       -3.533244   1.145636  -3.084  0.00211 ** 
hourly_cloud_average   -0.030869   0.005373  -5.746 1.28e-08 ***
monthAra               18.715850  38.743513   0.483  0.62917    
monthEki               51.540702  39.514496   1.304  0.19248    
monthEyl               31.333769  37.469047   0.836  0.40325    
monthHaz               -1.556742  39.088677  -0.040  0.96824    
monthKas               24.578372  35.946040   0.684  0.49432    
monthMar               43.898404  34.298011   1.280  0.20093    
monthMay               62.696698  35.725713   1.755  0.07964 .  
monthNis               27.908032  35.149466   0.794  0.42743    
monthOca               -5.164791  35.170800  -0.147  0.88329    
monthŞub               48.080570  34.544043   1.392  0.16434    
monthTem               55.405723  38.165297   1.452  0.14695    
hourly_max_t                  NA         NA      NA       NA    
monthAra:hourly_max_t  -0.062183   0.125821  -0.494  0.62128    
monthEki:hourly_max_t  -0.171335   0.126625  -1.353  0.17640    
monthEyl:hourly_max_t  -0.100101   0.118579  -0.844  0.39882    
monthHaz:hourly_max_t   0.008465   0.124695   0.068  0.94589    
monthKas:hourly_max_t  -0.081753   0.114721  -0.713  0.47628    
monthMar:hourly_max_t  -0.141418   0.108718  -1.301  0.19370    
monthMay:hourly_max_t  -0.204053   0.113489  -1.798  0.07254 .  
monthNis:hourly_max_t  -0.088926   0.111450  -0.798  0.42516    
monthOca:hourly_max_t   0.026587   0.112360   0.237  0.81301    
monthŞub:hourly_max_t  -0.157881   0.109816  -1.438  0.15090    
monthTem:hourly_max_t  -0.175299   0.120809  -1.451  0.14715    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.012 on 832 degrees of freedom
Multiple R-squared:  0.5738,	Adjusted R-squared:  0.5574 
F-statistic:    35 on 32 and 832 DF,  p-value: < 2.2e-16
	Breusch-Godfrey test for serial correlation of order up to 37

data:  Residuals
LM test = 44.803, df = 37, p-value = 0.1771
In [188]:
ggplot(hour_14_data, aes(x = date, y = production)) +
  geom_line() +
  labs(title = "Hourly Production Data for Hour 14",
       x = "Date",
       y = "Production") +
  theme_minimal()
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
In [144]:
hour_14_data <- all_data %>%
  filter(hour == 14, date >= start_date & date <= end_date)

forecast_data_hour_14 <- to_be_forecasted %>%
  filter(hour == 14)
forecast_data_hour_14$trend_hour_14 <- nrow(hour_14_data) + 1:nrow(forecast_data_hour_14)

last_production_value <- tail(hour_14_data$production, 1)
forecast_data_hour_14 <- forecast_data_hour_14 %>%
  mutate(lag_14_production = last_production_value)

predictions_hour_14 <- predict(lm_hour_14, newdata = forecast_data_hour_14)
predictions_hour_14[predictions_hour_14 < 0] <- 0

predictions_hour_14_df <- data.frame(
  date = forecast_data_hour_14$date,  
  predicted = predictions_hour_14
)

merged_data <- merge(hour_14_data, predictions_hour_14_df, by = "date")

output_data <- merged_data %>%
  select(date, actual = production, predicted)

output_data <- output_data %>%
  mutate(error = abs(predicted - actual))

wmape <- sum(output_data$error) / sum(output_data$actual) * 100

print(paste("WMAPE:", wmape, "%"))

output_dir <- "C:/Users/ecemozturk/Desktop"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

file_path <- file.path(output_dir, "predictions_hour_14_with_actuals.xlsx")

write_xlsx(output_data, file_path)

print(output_data)
[1] "WMAPE: 25.2414535555969 %"
Key: <date>
           date actual predicted     error
         <IDat>  <num>     <num>     <num>
  1: 2024-02-01   5.78  7.594635 1.8146355
  2: 2024-02-02   8.85  8.247988 0.6020120
  3: 2024-02-03   4.37  5.269883 0.8998832
  4: 2024-02-04   3.25  5.966098 2.7160984
  5: 2024-02-05   1.78  4.643374 2.8633739
 ---                                      
101: 2024-05-11   1.91  7.444803 5.5348028
102: 2024-05-12   0.00  5.197135 5.1971350
103: 2024-05-13   5.89  5.293584 0.5964157
104: 2024-05-14   9.70  8.215896 1.4841037
105: 2024-05-15   6.00  7.685520 1.6855205
In [75]:
hour_15_data <- all_data[all_data$hour == 15, ]
hour_15_data <- hour_15_data[,-c(2)]
hour_15_data$trend_hour_15 <- 1:nrow(hour_15_data)
hour_15_data[, lag_15_production := shift(production,1)]
hour_15_data[,lag_15_diff:=production-lag_15_production]
# Remove rows with NA in lagged production to ensure the model can run
hour_15_data <- hour_15_data[!is.na(lag_15_production)]
lm_hour_15 <- lm(production ~+lag_15_production+trend_hour_15+special_period+dswrf_surface+uswrf_top_of_atmosphere+dlwrf_surface+hourly_cloud_average+tmp_surface+is.weekend+is.ramadan+is.publicholiday +month*hourly_max_t , data = hour_15_data)
summary(lm_hour_15)
checkresiduals(lm_hour_15)
plot(lm_hour_15)
Call:
lm(formula = production ~ +lag_15_production + trend_hour_15 + 
    special_period + dswrf_surface + uswrf_top_of_atmosphere + 
    dlwrf_surface + hourly_cloud_average + tmp_surface + is.weekend + 
    is.ramadan + is.publicholiday + month * hourly_max_t, data = hour_15_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.2016 -0.8674  0.0785  1.0296  6.1278 

Coefficients: (1 not defined because of singularities)
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)             -3.027e+01  2.290e+01  -1.322 0.186554    
lag_15_production        1.126e-01  2.954e-02   3.812 0.000148 ***
trend_hour_15           -9.099e-04  3.005e-04  -3.028 0.002542 ** 
special_period           4.393e-01  2.064e-01   2.128 0.033592 *  
dswrf_surface            2.794e-03  1.325e-03   2.108 0.035292 *  
uswrf_top_of_atmosphere  9.910e-03  1.994e-03   4.970 8.15e-07 ***
dlwrf_surface           -2.490e-02  5.054e-03  -4.926 1.01e-06 ***
hourly_cloud_average    -2.727e-02  4.696e-03  -5.806 9.08e-09 ***
tmp_surface              1.325e-01  7.381e-02   1.795 0.073075 .  
is.weekend               1.330e+00  6.940e-01   1.916 0.055651 .  
is.ramadan               4.968e-01  2.717e-01   1.828 0.067875 .  
is.publicholiday        -1.619e+00  6.894e-01  -2.348 0.019120 *  
monthAra                 5.860e+00  2.896e+01   0.202 0.839681    
monthEki                 7.430e+00  2.907e+01   0.256 0.798293    
monthEyl                -1.439e+01  2.749e+01  -0.524 0.600686    
monthHaz                -5.492e+01  2.830e+01  -1.940 0.052663 .  
monthKas                 1.529e+01  2.573e+01   0.594 0.552418    
monthMar                -2.894e+00  2.435e+01  -0.119 0.905416    
monthMay                -9.822e+00  2.627e+01  -0.374 0.708613    
monthNis                -9.130e+00  2.481e+01  -0.368 0.712980    
monthOca                -4.429e+01  2.542e+01  -1.743 0.081788 .  
monthŞub                -1.830e+01  2.448e+01  -0.747 0.455068    
monthTem                -5.457e+00  2.763e+01  -0.197 0.843492    
hourly_max_t                    NA         NA      NA       NA    
monthAra:hourly_max_t   -2.336e-02  9.604e-02  -0.243 0.807928    
monthEki:hourly_max_t   -2.945e-02  9.430e-02  -0.312 0.754848    
monthEyl:hourly_max_t    4.475e-02  8.781e-02   0.510 0.610475    
monthHaz:hourly_max_t    1.811e-01  9.127e-02   1.984 0.047590 *  
monthKas:hourly_max_t   -5.652e-02  8.323e-02  -0.679 0.497289    
monthMar:hourly_max_t    9.090e-03  7.828e-02   0.116 0.907586    
monthMay:hourly_max_t    3.347e-02  8.465e-02   0.395 0.692666    
monthNis:hourly_max_t    2.897e-02  7.948e-02   0.364 0.715603    
monthOca:hourly_max_t    1.582e-01  8.305e-02   1.905 0.057099 .  
monthŞub:hourly_max_t    6.563e-02  7.907e-02   0.830 0.406755    
monthTem:hourly_max_t    1.707e-02  8.818e-02   0.194 0.846550    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.65 on 830 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.6737,	Adjusted R-squared:  0.6608 
F-statistic: 51.94 on 33 and 830 DF,  p-value: < 2.2e-16
	Breusch-Godfrey test for serial correlation of order up to 38

data:  Residuals
LM test = 50.289, df = 38, p-value = 0.08757
In [189]:
ggplot(hour_15_data, aes(x = date, y = production)) +
  geom_line() +
  labs(title = "Hourly Production Data for Hour 15",
       x = "Date",
       y = "Production") +
  theme_minimal()
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
In [143]:
# Filter the actual data for hour 15 and the specific date range
hour_15_data <- all_data %>%
  filter(hour == 15, date >= start_date & date <= end_date)

# Filter the to_be_forecasted data for hour 15
forecast_data_hour_15 <- to_be_forecasted %>%
  filter(hour == 15)

# Add the trend variable for hour 15
forecast_data_hour_15$trend_hour_15 <- nrow(hour_15_data) + 1:nrow(forecast_data_hour_15)

# Add the last known production value from hour_15_data to forecast_data_hour_15 for creating the lagged variable
last_production_value <- tail(hour_15_data$production, 1)
forecast_data_hour_15 <- forecast_data_hour_15 %>%
  mutate(lag_15_production = last_production_value)

# Make predictions
predictions_hour_15 <- predict(lm_hour_15, newdata = forecast_data_hour_15)
predictions_hour_15[predictions_hour_15 < 0] <- 0

# Create a dataframe with predictions and corresponding dates
predictions_hour_15_df <- data.frame(
  date = forecast_data_hour_15$date,  # Assuming 'date' column exists in forecast_data_hour_15
  predicted = predictions_hour_15
)

# Merge with actual values
merged_data <- merge(hour_15_data, predictions_hour_15_df, by = "date")

# Select and rename the columns to keep
output_data <- merged_data %>%
  select(date, actual = production, predicted)

# Calculate WMAPE
output_data <- output_data %>%
  mutate(error = abs(predicted - actual))

wmape <- sum(output_data$error) / sum(output_data$actual) * 100

# Print WMAPE
print(paste("WMAPE:", wmape, "%"))

# Ensure the directory exists
output_dir <- "C:/Users/ecemozturk/Desktop"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

# Specify the file path
file_path <- file.path(output_dir, "predictions_hour_15_with_actuals.xlsx")

# Write the combined data to an Excel file
write_xlsx(output_data, file_path)

# Print the combined data
print(output_data)
[1] "WMAPE: 31.6315801980676 %"
Key: <date>
           date actual predicted     error
         <IDat>  <num>     <num>     <num>
  1: 2024-02-01   4.78  4.321723 0.4582772
  2: 2024-02-02   5.64  4.932761 0.7072392
  3: 2024-02-03   3.24  2.428159 0.8118405
  4: 2024-02-04   3.71  2.997038 0.7129616
  5: 2024-02-05   1.88  2.282787 0.4027869
 ---                                      
101: 2024-05-11   1.24  5.956781 4.7167814
102: 2024-05-12   0.00  4.214975 4.2149754
103: 2024-05-13   5.32  4.455304 0.8646957
104: 2024-05-14   7.82  6.907217 0.9127826
105: 2024-05-15   3.11  6.406795 3.2967952
In [79]:
hour_16_data <- all_data[all_data$hour == 16, ]
hour_16_data <- hour_16_data[,-c(2)]
hour_16_data$trend_hour_16 <- 1:nrow(hour_16_data)
hour_16_data[, lag_16_production := shift(production,1)]
hour_16_data[,lag_16_diff:=production-lag_16_production]
# Remove rows with NA in lagged production to ensure the model can run
hour_16_data <- hour_16_data[!is.na(lag_16_production)]
lm_hour_16 <- lm(production ~+lag_16_production+trend_hour_16+ special_period+dswrf_surface+uswrf_top_of_atmosphere+hourly_cloud_average+is.ramadan+month*hourly_max_t, data = hour_16_data)
summary(lm_hour_16)
checkresiduals(lm_hour_16)
plot(lm_hour_16)
Call:
lm(formula = production ~ +lag_16_production + trend_hour_16 + 
    special_period + dswrf_surface + uswrf_top_of_atmosphere + 
    hourly_cloud_average + is.ramadan + month * hourly_max_t, 
    data = hour_16_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.2282 -0.6438 -0.0160  0.5996  8.0625 

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)              2.935e+00  1.834e+01   0.160 0.872886    
lag_16_production        2.737e-01  3.176e-02   8.616  < 2e-16 ***
trend_hour_16           -8.723e-04  2.400e-04  -3.635 0.000295 ***
special_period           5.022e-01  1.667e-01   3.012 0.002674 ** 
dswrf_surface            5.930e-03  1.040e-03   5.704 1.62e-08 ***
uswrf_top_of_atmosphere  4.940e-03  1.611e-03   3.066 0.002240 ** 
hourly_cloud_average    -7.908e-03  3.141e-03  -2.517 0.012012 *  
is.ramadan               5.014e-01  2.192e-01   2.287 0.022432 *  
monthAra                -1.212e+01  2.413e+01  -0.502 0.615601    
monthEki                 1.546e+01  2.343e+01   0.660 0.509576    
monthEyl                -1.404e+01  2.254e+01  -0.623 0.533494    
monthHaz                -3.036e+01  2.231e+01  -1.361 0.173812    
monthKas                 1.182e+01  2.110e+01   0.560 0.575421    
monthMar                -1.229e+01  1.938e+01  -0.634 0.526199    
monthMay                -2.709e+00  2.052e+01  -0.132 0.894991    
monthNis                -1.475e+01  1.961e+01  -0.752 0.452165    
monthOca                -3.380e+01  2.039e+01  -1.658 0.097778 .  
monthŞub                -2.585e+01  1.950e+01  -1.325 0.185439    
monthTem                -2.035e+01  2.256e+01  -0.902 0.367252    
hourly_max_t            -1.297e-02  5.896e-02  -0.220 0.825912    
monthAra:hourly_max_t    4.147e-02  8.127e-02   0.510 0.609986    
monthEki:hourly_max_t   -5.615e-02  7.697e-02  -0.730 0.465899    
monthEyl:hourly_max_t    4.373e-02  7.291e-02   0.600 0.548823    
monthHaz:hourly_max_t    9.828e-02  7.265e-02   1.353 0.176480    
monthKas:hourly_max_t   -4.340e-02  6.926e-02  -0.627 0.531106    
monthMar:hourly_max_t    3.834e-02  6.301e-02   0.608 0.543040    
monthMay:hourly_max_t    7.239e-03  6.681e-02   0.108 0.913744    
monthNis:hourly_max_t    4.693e-02  6.348e-02   0.739 0.459948    
monthOca:hourly_max_t    1.189e-01  6.741e-02   1.764 0.078062 .  
monthŞub:hourly_max_t    8.877e-02  6.368e-02   1.394 0.163703    
monthTem:hourly_max_t    6.380e-02  7.276e-02   0.877 0.380820    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.33 on 834 degrees of freedom
Multiple R-squared:  0.6828,	Adjusted R-squared:  0.6714 
F-statistic: 59.84 on 30 and 834 DF,  p-value: < 2.2e-16
	Breusch-Godfrey test for serial correlation of order up to 34

data:  Residuals
LM test = 76.913, df = 34, p-value = 3.623e-05
In [190]:
ggplot(hour_16_data, aes(x = date, y = production)) +
  geom_line() +
  labs(title = "Hourly Production Data for Hour 16",
       x = "Date",
       y = "Production") +
  theme_minimal()
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
In [142]:
# Filter the actual data for hour 16 and the specific date range
hour_16_data <- all_data %>%
  filter(hour == 16, date >= start_date & date <= end_date)

# Filter the to_be_forecasted data for hour 16
forecast_data_hour_16 <- to_be_forecasted %>%
  filter(hour == 16)

# Add the trend variable for hour 16
forecast_data_hour_16$trend_hour_16 <- nrow(hour_16_data) + 1:nrow(forecast_data_hour_16)

# Add the last known production value from hour_16_data to forecast_data_hour_16 for creating the lagged variable
last_production_value <- tail(hour_16_data$production, 1)
forecast_data_hour_16 <- forecast_data_hour_16 %>%
  mutate(lag_16_production = last_production_value)

# Make predictions
predictions_hour_16 <- predict(lm_hour_16, newdata = forecast_data_hour_16)
predictions_hour_16[predictions_hour_16 < 0] <- 0

# Create a dataframe with predictions and corresponding dates
predictions_hour_16_df <- data.frame(
  date = forecast_data_hour_16$date,  # Assuming 'date' column exists in forecast_data_hour_16
  predicted = predictions_hour_16
)

# Merge with actual values
merged_data <- merge(hour_16_data, predictions_hour_16_df, by = "date")

# Select and rename the columns to keep
output_data <- merged_data %>%
  select(date, actual = production, predicted)

# Calculate WMAPE
output_data <- output_data %>%
  mutate(error = abs(predicted - actual))

wmape <- sum(output_data$error) / sum(output_data$actual) * 100

# Print WMAPE
print(paste("WMAPE:", wmape, "%"))

# Ensure the directory exists
output_dir <- "C:/Users/ecemozturk/Desktop"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

# Specify the file path
file_path <- file.path(output_dir, "predictions_hour_16_with_actuals.xlsx")

# Write the combined data to an Excel file
write_xlsx(output_data, file_path)

# Print the combined data
print(output_data)
[1] "WMAPE: 39.6096503905389 %"
Key: <date>
           date actual predicted     error
         <IDat>  <num>     <num>     <num>
  1: 2024-02-01   0.53 0.9404273 0.4104273
  2: 2024-02-02   1.53 1.1416612 0.3883388
  3: 2024-02-03   0.94 0.3487044 0.5912956
  4: 2024-02-04   0.66 0.5928544 0.0671456
  5: 2024-02-05   0.00 0.4192077 0.4192077
 ---                                      
101: 2024-05-11   0.78 2.9014597 2.1214597
102: 2024-05-12   0.00 2.3909400 2.3909400
103: 2024-05-13   3.62 2.3220644 1.2979356
104: 2024-05-14   4.55 3.7036654 0.8463346
105: 2024-05-15   0.00 3.0294696 3.0294696
In [200]:
hour_17_data <- all_data[all_data$hour == 17, ]
hour_17_data <- hour_17_data[,-c(2)]
hour_17_data$trend_hour_17 <- 1:nrow(hour_17_data)
hour_17_data$nm <- as.numeric(format(hour_17_data$date, "%m"))
# Assuming your date column is named 'date_column'
hour_17_data$is_not_winter <- as.numeric(!hour_17_data$nm %in% c(12, 1, 2))
hour_17_data <- as.data.table(hour_17_data)
#hour_17_data <- hour_17_data[production > 0]
#hour_17_data[, log_production := log(production)]
hour_17_data[, lag_17_production := shift(production,1)]
hour_17_data[,lag_17_diff:=production-lag_17_production]
# Remove rows with NA in lagged production to ensure the model can run
hour_17_data <- hour_17_data[!is.na(lag_17_production)]
hour_17_data <- hour_17_data[!is.na(lag_17_diff)]
lm_hour_17 <- lm(production ~+lag_17_production+trend_hour_17+special_period+is.ramadan+is.religousday+uswrf_surface+uswrf_top_of_atmosphere+dswrf_surface+month*hourly_max_t, data = hour_17_data)
summary(lm_hour_17)
checkresiduals(lm_hour_17)
plot(lm_hour_17)
Call:
lm(formula = production ~ +lag_17_production + trend_hour_17 + 
    special_period + is.ramadan + is.religousday + uswrf_surface + 
    uswrf_top_of_atmosphere + dswrf_surface + month * hourly_max_t, 
    data = hour_17_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.2207 -0.2810 -0.0078  0.1988  4.0676 

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)             -2.0813716 11.2734156  -0.185   0.8536    
lag_17_production        0.4986512  0.0302889  16.463  < 2e-16 ***
trend_hour_17           -0.0007052  0.0001477  -4.774 2.13e-06 ***
special_period           0.2365340  0.0964244   2.453   0.0144 *  
is.ramadan               0.2787440  0.1268847   2.197   0.0283 *  
is.religousday          -0.1523958  0.1673489  -0.911   0.3627    
uswrf_surface           -0.0044092  0.0026311  -1.676   0.0941 .  
uswrf_top_of_atmosphere  0.0012367  0.0011655   1.061   0.2890    
dswrf_surface            0.0030840  0.0012708   2.427   0.0154 *  
monthAra                -0.2137688 14.5499366  -0.015   0.9883    
monthEki                 2.5256493 14.4314141   0.175   0.8611    
monthEyl                 8.8020038 13.7692349   0.639   0.5228    
monthHaz                11.8663195 13.8225292   0.858   0.3909    
monthKas                 6.1699317 13.0493808   0.473   0.6365    
monthMar                 7.6478407 12.1969094   0.627   0.5308    
monthMay                11.6937865 12.8830519   0.908   0.3643    
monthNis                -0.2445660 12.1437403  -0.020   0.9839    
monthOca                -1.0584106 12.3161696  -0.086   0.9315    
monthŞub                 5.8132312 12.3323010   0.471   0.6375    
monthTem                -8.7422014 14.4185303  -0.606   0.5445    
hourly_max_t             0.0062948  0.0369846   0.170   0.8649    
monthAra:hourly_max_t    0.0017844  0.0495986   0.036   0.9713    
monthEki:hourly_max_t   -0.0097040  0.0480400  -0.202   0.8400    
monthEyl:hourly_max_t   -0.0308600  0.0451584  -0.683   0.4946    
monthHaz:hourly_max_t   -0.0409155  0.0454998  -0.899   0.3688    
monthKas:hourly_max_t   -0.0215024  0.0435056  -0.494   0.6213    
monthMar:hourly_max_t   -0.0283603  0.0403228  -0.703   0.4820    
monthMay:hourly_max_t   -0.0400826  0.0424810  -0.944   0.3457    
monthNis:hourly_max_t   -0.0001357  0.0398308  -0.003   0.9973    
monthOca:hourly_max_t    0.0042546  0.0410731   0.104   0.9175    
monthŞub:hourly_max_t   -0.0211684  0.0409375  -0.517   0.6052    
monthTem:hourly_max_t    0.0273820  0.0470553   0.582   0.5608    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.7635 on 833 degrees of freedom
Multiple R-squared:  0.6759,	Adjusted R-squared:  0.6638 
F-statistic: 56.03 on 31 and 833 DF,  p-value: < 2.2e-16
	Breusch-Godfrey test for serial correlation of order up to 35

data:  Residuals
LM test = 147.01, df = 35, p-value = 1.113e-15
In [202]:
ggplot(hour_17_data, aes(x = date, y = production)) +
  geom_line() +
  labs(title = "Hourly Production Data for Hour 17",
       x = "Date",
       y = "Production") +
  theme_minimal()
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”

Log transformation for hour 17 was considered. However, we have decided not to use it since it requires elimination of zero values and disrupts continuity of our data.

In [201]:
# Filter the actual data for hour 17 and the specific date range
hour_17_data <- all_data %>%
  filter(hour == 17, date >= start_date & date <= end_date)

# Filter the to_be_forecasted data for hour 17
forecast_data_hour_17 <- to_be_forecasted %>%
  filter(hour == 17)

# Add the trend variable for hour 17
forecast_data_hour_17$trend_hour_17 <- nrow(hour_17_data) + 1:nrow(forecast_data_hour_17)

# Add the last known production value from hour_17_data to forecast_data_hour_17 for creating the lagged variable
last_production_value <- tail(hour_17_data$production, 1)
forecast_data_hour_17 <- forecast_data_hour_17 %>%
  mutate(lag_17_production = last_production_value)

# Make predictions
predictions_hour_17 <- predict(lm_hour_17, newdata = forecast_data_hour_17)
predictions_hour_17[predictions_hour_17 < 0] <- 0

# Create a dataframe with predictions and corresponding dates
predictions_hour_17_df <- data.frame(
  date = forecast_data_hour_17$date,  # Assuming 'date' column exists in forecast_data_hour_17
  predicted = predictions_hour_17
)

# Merge with actual values
merged_data <- merge(hour_17_data, predictions_hour_17_df, by = "date")

# Select and rename the columns to keep
output_data <- merged_data %>%
  select(date, actual = production, predicted)

# Calculate WMAPE
output_data <- output_data %>%
  mutate(error = abs(predicted - actual))

wmape <- sum(output_data$error) / sum(output_data$actual) * 100

# Print WMAPE
print(paste("WMAPE:", wmape, "%"))

# Ensure the directory exists
output_dir <- "C:/Users/ecemozturk/Desktop"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

# Specify the file path
file_path <- file.path(output_dir, "predictions_hour_17_with_actuals.xlsx")

# Write the combined data to an Excel file
write_xlsx(output_data, file_path)

# Print the combined data
print(output_data)
[1] "WMAPE: 61.689983094304 %"
Key: <date>
           date actual predicted     error
         <IDat>  <num>     <num>     <num>
  1: 2024-02-01   0.00 0.3053737 0.3053737
  2: 2024-02-02   0.00 0.3176936 0.3176936
  3: 2024-02-03   0.00 0.2818699 0.2818699
  4: 2024-02-04   0.00 0.2827902 0.2827902
  5: 2024-02-05   0.00 0.1942501 0.1942501
 ---                                      
101: 2024-05-11   0.30 1.0700304 0.7700304
102: 2024-05-12   0.00 1.0176547 1.0176547
103: 2024-05-13   1.30 1.0150746 0.2849254
104: 2024-05-14   1.33 1.3800042 0.0500042
105: 2024-05-15   0.00 1.1531298 1.1531298
In [94]:
hour_18_data <- all_data[all_data$hour == 18, ]
hour_18_data <- hour_18_data[,-c(2)]
# Assuming your dataframe is named 'data' and the production column is named 'production'
#data_18_filtered <- hour_18_data[hour_18_data$production != 0, ]
hour_18_data$trend_hour_18 <- 1:nrow(hour_18_data)
hour_18_data[, lag_18_production := shift(production,1)]
hour_18_data[,lag_18_diff:=production-lag_18_production]
# Remove rows with NA in lagged production to ensure the model can run
hour_18_data <- hour_18_data[!is.na(lag_18_production)]
lm_hour_18 <- lm(production ~+lag_18_production +trend_hour_18+special_period+tmp_surface+dswrf_surface+month*hourly_max_t  , data = hour_18_data)
summary(lm_hour_18)
checkresiduals(lm_hour_18)
plot(lm_hour_18)
Call:
lm(formula = production ~ +lag_18_production + trend_hour_18 + 
    special_period + tmp_surface + dswrf_surface + month * hourly_max_t, 
    data = hour_18_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.6744 -0.0467 -0.0040  0.0102  3.7526 

Coefficients: (1 not defined because of singularities)
                        Estimate Std. Error t value Pr(>|t|)   
(Intercept)           -1.370e+01  4.342e+00  -3.155  0.00166 **
lag_18_production      8.499e-02  3.446e-02   2.466  0.01385 * 
trend_hour_18          2.837e-05  4.623e-05   0.614  0.53957   
special_period         1.015e-01  3.311e-02   3.064  0.00226 **
tmp_surface            4.579e-02  1.437e-02   3.186  0.00149 **
dswrf_surface          2.093e-05  1.291e-04   0.162  0.87126   
monthAra               1.374e+01  5.274e+00   2.606  0.00933 **
monthEki               1.297e+01  5.486e+00   2.363  0.01834 * 
monthEyl               1.386e+01  5.222e+00   2.654  0.00811 **
monthHaz               1.621e+01  5.421e+00   2.991  0.00287 **
monthKas               1.366e+01  5.106e+00   2.676  0.00760 **
monthMar               1.392e+01  4.540e+00   3.066  0.00224 **
monthMay               1.224e+01  5.012e+00   2.442  0.01481 * 
monthNis               1.345e+01  4.690e+00   2.869  0.00422 **
monthOca               1.389e+01  4.602e+00   3.018  0.00262 **
monthŞub               1.385e+01  4.524e+00   3.062  0.00227 **
monthTem               7.246e+00  5.681e+00   1.275  0.20254   
hourly_max_t                  NA         NA      NA       NA   
monthAra:hourly_max_t -4.600e-02  1.800e-02  -2.555  0.01079 * 
monthEki:hourly_max_t -4.346e-02  1.853e-02  -2.345  0.01924 * 
monthEyl:hourly_max_t -4.656e-02  1.739e-02  -2.677  0.00758 **
monthHaz:hourly_max_t -5.379e-02  1.810e-02  -2.972  0.00304 **
monthKas:hourly_max_t -4.581e-02  1.727e-02  -2.653  0.00813 **
monthMar:hourly_max_t -4.664e-02  1.511e-02  -3.086  0.00209 **
monthMay:hourly_max_t -4.056e-02  1.676e-02  -2.420  0.01574 * 
monthNis:hourly_max_t -4.498e-02  1.561e-02  -2.881  0.00407 **
monthOca:hourly_max_t -4.653e-02  1.541e-02  -3.019  0.00261 **
monthŞub:hourly_max_t -4.641e-02  1.509e-02  -3.076  0.00217 **
monthTem:hourly_max_t -2.342e-02  1.884e-02  -1.243  0.21417   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2659 on 837 degrees of freedom
Multiple R-squared:  0.1977,	Adjusted R-squared:  0.1719 
F-statistic:  7.64 on 27 and 837 DF,  p-value: < 2.2e-16
	Breusch-Godfrey test for serial correlation of order up to 32

data:  Residuals
LM test = 207.6, df = 32, p-value < 2.2e-16
In [192]:
ggplot(hour_18_data, aes(x = date, y = production)) +
  geom_line() +
  labs(title = "Hourly Production Data for Hour 18",
       x = "Date",
       y = "Production") +
  theme_minimal()
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <c5>”
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
“conversion failure on 'Şub' in 'mbcsToSbcs': dot substituted for <9e>”
In [176]:
# Filter the actual data for hour 18 and the specific date range
hour_18_data <- all_data %>%
  filter(hour == 18, date >= start_date & date <= end_date)

# Filter the to_be_forecasted data for hour 18
forecast_data_hour_18 <- to_be_forecasted %>%
  filter(hour == 18)

# Add the trend variable for hour 18
forecast_data_hour_18$trend_hour_18 <- nrow(hour_18_data) + 1:nrow(forecast_data_hour_18)

# Add the last known production value from hour_18_data to forecast_data_hour_18 for creating the lagged variable
last_production_value <- tail(hour_18_data$production, 1)
forecast_data_hour_18 <- forecast_data_hour_18 %>%
  mutate(lag_18_production = last_production_value)

# Make predictions
predictions_hour_18 <- predict(lm_hour_18, newdata = forecast_data_hour_18)
predictions_hour_18[predictions_hour_18 < 0] <- 0

# Create a dataframe with predictions and corresponding dates
predictions_hour_18_df <- data.frame(
  date = forecast_data_hour_18$date,  # Assuming 'date' column exists in forecast_data_hour_18
  predicted = predictions_hour_18
)

# Merge with actual values
merged_data <- merge(hour_18_data, predictions_hour_18_df, by = "date")

# Select and rename the columns to keep
output_data <- merged_data %>%
  select(date, actual = production, predicted)

# Calculate WMAPE
output_data <- output_data %>%
  mutate(error = abs(predicted - actual))

wmape <- sum(output_data$error) / sum(output_data$actual) * 100

# Print WMAPE
print(paste("WMAPE:", wmape, "%"))

# Print the combined data
print(output_data)
[1] "WMAPE: 87.6958599729125 %"
Key: <date>
           date actual  predicted      error
         <IDat>  <num>      <num>      <num>
  1: 2024-02-01   0.00 0.00000000 0.00000000
  2: 2024-02-02   0.00 0.00000000 0.00000000
  3: 2024-02-03   0.00 0.00000000 0.00000000
  4: 2024-02-04   0.00 0.00000000 0.00000000
  5: 2024-02-05   0.00 0.00000000 0.00000000
 ---                                        
101: 2024-05-11   0.00 0.05853033 0.05853033
102: 2024-05-12   0.00 0.04082760 0.04082760
103: 2024-05-13   0.16 0.04495209 0.11504791
104: 2024-05-14   0.14 0.05715345 0.08284655
105: 2024-05-15   0.00 0.06877722 0.06877722

Conclusion for Linear Regression models: Especially for the hours in between 7 and 16, linear regression models and WMAPE values performs better than other hours mainly depending on the sun-set & sun-rise period. Although we tried to fit the production into a linear regression model, we still observe high WMAPE values in hours 4,5,6,17 and 18 because of having limited amount of data.